From 307e5968d6314516560f832dd7ed9ce1757d00bb Mon Sep 17 00:00:00 2001 From: Ralph Giles Date: Tue, 7 Oct 2014 09:49:40 -0700 Subject: [PATCH] Bug 1063356 - Update libvpx source. r=kinetik Results of running ./update.py --ndk ~/android/android-ndk-r9 --commit c731d6a4f19eea861ceb2ff31399420b2452eb74 --- media/libvpx/PATENTS | 39 +- media/libvpx/README_MOZILLA | 2 +- media/libvpx/build/make/ads2gas.pl | 2 +- media/libvpx/build/make/obj_int_extract.c | 36 +- media/libvpx/build/make/thumb.pm | 6 +- media/libvpx/sources.mozbuild | 639 +-- media/libvpx/third_party/x86inc/x86inc.asm | 45 +- media/libvpx/vp8/common/alloccommon.h | 12 +- .../arm/armv6/vp8_variance16x16_armv6.asm | 8 +- .../arm/armv6/vp8_variance8x8_armv6.asm | 4 +- .../vp8_variance_halfpixvar16x16_h_armv6.asm | 8 +- .../vp8_variance_halfpixvar16x16_hv_armv6.asm | 8 +- .../vp8_variance_halfpixvar16x16_v_armv6.asm | 8 +- .../vp8/common/arm/bilinearfilter_arm.h | 14 +- media/libvpx/vp8/common/arm/dequantize_arm.c | 17 - media/libvpx/vp8/common/arm/loopfilter_arm.c | 6 +- .../arm/neon/bilinearpredict16x16_neon.asm | 357 -- .../arm/neon/bilinearpredict4x4_neon.asm | 130 - .../arm/neon/bilinearpredict8x4_neon.asm | 135 - .../arm/neon/bilinearpredict8x8_neon.asm | 183 - .../common/arm/neon/bilinearpredict_neon.c | 699 +++ .../arm/neon/buildintrapredictorsmby_neon.asm | 584 --- .../vp8/common/arm/neon/copymem16x16_neon.asm | 59 - .../vp8/common/arm/neon/copymem8x4_neon.asm | 34 - .../vp8/common/arm/neon/copymem8x8_neon.asm | 43 - .../libvpx/vp8/common/arm/neon/copymem_neon.c | 59 + .../common/arm/neon/dc_only_idct_add_neon.asm | 54 - .../common/arm/neon/dc_only_idct_add_neon.c | 42 + .../vp8/common/arm/neon/dequant_idct_neon.asm | 131 - .../vp8/common/arm/neon/dequant_idct_neon.c | 142 + .../vp8/common/arm/neon/dequantizeb_neon.asm | 34 - .../vp8/common/arm/neon/dequantizeb_neon.c | 25 + .../arm/neon/idct_dequant_0_2x_neon.asm | 79 - .../common/arm/neon/idct_dequant_0_2x_neon.c | 62 + .../arm/neon/idct_dequant_full_2x_neon.asm | 196 - .../arm/neon/idct_dequant_full_2x_neon.c | 185 + .../vp8/common/arm/neon/iwalsh_neon.asm | 87 - .../libvpx/vp8/common/arm/neon/iwalsh_neon.c | 102 + .../vp8/common/arm/neon/loopfilter_neon.asm | 397 -- .../vp8/common/arm/neon/loopfilter_neon.c | 549 +++ .../loopfiltersimplehorizontaledge_neon.asm | 117 - .../loopfiltersimplehorizontaledge_neon.c | 111 + .../loopfiltersimpleverticaledge_neon.asm | 154 - .../neon/loopfiltersimpleverticaledge_neon.c | 279 ++ .../vp8/common/arm/neon/mbloopfilter_neon.asm | 469 -- .../vp8/common/arm/neon/mbloopfilter_neon.c | 625 +++ .../vp8/common/arm/neon/reconintra_neon.c | 210 + .../libvpx/vp8/common/arm/neon/sad16_neon.asm | 207 - .../libvpx/vp8/common/arm/neon/sad8_neon.asm | 209 - media/libvpx/vp8/common/arm/neon/sad_neon.c | 184 + .../vp8/common/arm/neon/save_reg_neon.asm | 36 - .../common/arm/neon/shortidct4x4llm_neon.asm | 139 - .../common/arm/neon/shortidct4x4llm_neon.c | 123 + .../arm/neon/sixtappredict16x16_neon.asm | 490 -- .../common/arm/neon/sixtappredict4x4_neon.asm | 422 -- .../common/arm/neon/sixtappredict8x4_neon.asm | 473 -- .../common/arm/neon/sixtappredict8x8_neon.asm | 524 -- .../vp8/common/arm/neon/sixtappredict_neon.c | 1757 +++++++ .../vp8/common/arm/neon/variance_neon.asm | 276 -- .../vp8/common/arm/neon/variance_neon.c | 323 ++ .../neon/vp8_subpixelvariance16x16_neon.asm | 423 -- .../neon/vp8_subpixelvariance16x16s_neon.asm | 572 --- .../arm/neon/vp8_subpixelvariance8x8_neon.asm | 222 - .../arm/neon/vp8_subpixelvariance_neon.c | 1028 ++++ 
media/libvpx/vp8/common/arm/reconintra_arm.c | 58 - media/libvpx/vp8/common/blockd.h | 14 +- media/libvpx/vp8/common/coefupdateprobs.h | 12 + media/libvpx/vp8/common/common.h | 17 +- media/libvpx/vp8/common/default_coef_probs.h | 12 + media/libvpx/vp8/common/entropy.h | 12 +- media/libvpx/vp8/common/entropymode.h | 12 +- media/libvpx/vp8/common/entropymv.h | 12 +- media/libvpx/vp8/common/extend.h | 12 +- media/libvpx/vp8/common/filter.h | 12 +- media/libvpx/vp8/common/findnearmv.h | 12 +- media/libvpx/vp8/common/header.h | 12 +- media/libvpx/vp8/common/invtrans.h | 12 +- media/libvpx/vp8/common/loopfilter.c | 1 + media/libvpx/vp8/common/loopfilter.h | 12 +- media/libvpx/vp8/common/modecont.h | 12 +- media/libvpx/vp8/common/mv.h | 12 +- media/libvpx/vp8/common/onyx.h | 25 +- media/libvpx/vp8/common/onyxc_int.h | 12 +- media/libvpx/vp8/common/onyxd.h | 6 +- media/libvpx/vp8/common/postproc.c | 17 +- media/libvpx/vp8/common/postproc.h | 12 +- media/libvpx/vp8/common/ppflags.h | 12 +- media/libvpx/vp8/common/pragmas.h | 19 - media/libvpx/vp8/common/quant_common.h | 13 + media/libvpx/vp8/common/reconinter.h | 12 +- media/libvpx/vp8/common/reconintra4x4.h | 12 +- media/libvpx/vp8/common/setupintrarecon.h | 12 +- media/libvpx/vp8/common/swapyv12buffer.h | 12 +- media/libvpx/vp8/common/systemdependent.h | 12 + media/libvpx/vp8/common/threading.h | 12 +- media/libvpx/vp8/common/treecoder.h | 12 +- media/libvpx/vp8/common/variance.h | 12 +- media/libvpx/vp8/common/vp8_entropymodedata.h | 12 + media/libvpx/vp8/common/x86/filter_x86.h | 14 +- ...2.asm => loopfilter_block_sse2_x86_64.asm} | 0 .../libvpx/vp8/common/x86/loopfilter_mmx.asm | 4 +- .../libvpx/vp8/common/x86/loopfilter_sse2.asm | 4 +- media/libvpx/vp8/common/x86/postproc_mmx.asm | 10 +- media/libvpx/vp8/common/x86/postproc_sse2.asm | 10 +- media/libvpx/vp8/common/x86/postproc_x86.c | 24 - media/libvpx/vp8/common/x86/recon_sse2.asm | 79 +- .../vp8/common/x86/variance_impl_mmx.asm | 18 +- media/libvpx/vp8/common/x86/variance_mmx.c | 1 - media/libvpx/vp8/common/x86/variance_sse2.c | 1 - media/libvpx/vp8/common/x86/variance_ssse3.c | 1 - media/libvpx/vp8/decoder/dboolhuff.c | 7 +- media/libvpx/vp8/decoder/dboolhuff.h | 25 +- .../decoder/{decodframe.c => decodeframe.c} | 29 +- media/libvpx/vp8/decoder/decodemv.h | 14 +- media/libvpx/vp8/decoder/decoderthreading.h | 14 +- media/libvpx/vp8/decoder/detokenize.h | 14 +- media/libvpx/vp8/decoder/ec_types.h | 14 +- media/libvpx/vp8/decoder/error_concealment.c | 4 +- media/libvpx/vp8/decoder/error_concealment.h | 14 +- media/libvpx/vp8/decoder/onyxd_if.c | 28 +- media/libvpx/vp8/decoder/onyxd_int.h | 16 +- media/libvpx/vp8/decoder/treereader.h | 14 +- .../vp8/encoder/arm/neon/denoising_neon.c | 478 ++ .../libvpx/vp8/encoder/arm/neon/picklpf_arm.c | 46 - .../vp8/encoder/arm/neon/shortfdct_neon.asm | 221 - .../vp8/encoder/arm/neon/shortfdct_neon.c | 269 + .../vp8/encoder/arm/neon/subtract_neon.asm | 199 - .../vp8/encoder/arm/neon/subtract_neon.c | 154 + .../vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 70 - .../encoder/arm/neon/vp8_mse16x16_neon.asm | 9 +- .../arm/neon/vp8_shortwalsh4x4_neon.asm | 103 - .../encoder/arm/neon/vp8_shortwalsh4x4_neon.c | 118 + media/libvpx/vp8/encoder/bitstream.c | 1 - media/libvpx/vp8/encoder/bitstream.h | 18 +- media/libvpx/vp8/encoder/block.h | 13 +- media/libvpx/vp8/encoder/boolhuff.h | 16 +- media/libvpx/vp8/encoder/dct_value_cost.h | 13 + media/libvpx/vp8/encoder/dct_value_tokens.h | 13 + media/libvpx/vp8/encoder/defaultcoefcounts.h | 13 + 
media/libvpx/vp8/encoder/denoising.c | 462 +- media/libvpx/vp8/encoder/denoising.h | 73 +- media/libvpx/vp8/encoder/encodeframe.c | 15 +- media/libvpx/vp8/encoder/encodeframe.h | 12 +- media/libvpx/vp8/encoder/encodeintra.h | 12 +- media/libvpx/vp8/encoder/encodemb.h | 12 +- media/libvpx/vp8/encoder/encodemv.h | 12 +- media/libvpx/vp8/encoder/ethreading.c | 15 + media/libvpx/vp8/encoder/firstpass.c | 16 +- media/libvpx/vp8/encoder/firstpass.h | 12 +- media/libvpx/vp8/encoder/lookahead.h | 12 +- media/libvpx/vp8/encoder/mcomp.c | 1 + media/libvpx/vp8/encoder/mcomp.h | 12 +- media/libvpx/vp8/encoder/modecosts.h | 12 +- media/libvpx/vp8/encoder/mr_dissim.c | 1 + media/libvpx/vp8/encoder/mr_dissim.h | 12 +- media/libvpx/vp8/encoder/onyx_if.c | 427 +- media/libvpx/vp8/encoder/onyx_int.h | 20 +- media/libvpx/vp8/encoder/pickinter.c | 41 +- media/libvpx/vp8/encoder/pickinter.h | 12 +- media/libvpx/vp8/encoder/picklpf.c | 10 +- media/libvpx/vp8/encoder/psnr.c | 31 - media/libvpx/vp8/encoder/psnr.h | 17 - media/libvpx/vp8/encoder/quantize.h | 12 +- media/libvpx/vp8/encoder/ratectrl.c | 9 - media/libvpx/vp8/encoder/ratectrl.h | 11 +- media/libvpx/vp8/encoder/rdopt.c | 23 +- media/libvpx/vp8/encoder/rdopt.h | 17 +- media/libvpx/vp8/encoder/segmentation.h | 12 + media/libvpx/vp8/encoder/temporal_filter.c | 4 +- media/libvpx/vp8/encoder/tokenize.c | 12 +- media/libvpx/vp8/encoder/tokenize.h | 14 +- media/libvpx/vp8/encoder/treewriter.h | 12 +- media/libvpx/vp8/encoder/x86/denoising_sse2.c | 311 +- media/libvpx/vp8/encoder/x86/quantize_sse2.c | 3 +- .../libvpx/vp8/encoder/x86/quantize_sse4.asm | 256 - media/libvpx/vp8/encoder/x86/quantize_sse4.c | 128 + .../libvpx/vp8/encoder/x86/quantize_ssse3.asm | 138 - media/libvpx/vp8/encoder/x86/quantize_ssse3.c | 114 + .../vp8/encoder/x86/ssim_opt_x86_64.asm | 216 + media/libvpx/vp8/vp8_cx_iface.c | 351 +- media/libvpx/vp8/vp8_dx_iface.c | 403 +- media/libvpx/vp8_rtcd_armv7-android-gcc.h | 905 ++-- media/libvpx/vp8_rtcd_generic-gnu.h | 465 +- media/libvpx/vp8_rtcd_x86-darwin9-gcc.h | 1184 +++-- media/libvpx/vp8_rtcd_x86-linux-gcc.h | 1184 +++-- media/libvpx/vp8_rtcd_x86-win32-gcc.h | 1184 +++-- media/libvpx/vp8_rtcd_x86-win32-vs8.h | 1184 +++-- media/libvpx/vp8_rtcd_x86_64-darwin9-gcc.h | 866 ++-- media/libvpx/vp8_rtcd_x86_64-linux-gcc.h | 866 ++-- media/libvpx/vp8_rtcd_x86_64-win64-gcc.h | 866 ++-- media/libvpx/vp8_rtcd_x86_64-win64-vs8.h | 866 ++-- .../vp9/common/arm/neon/vp9_convolve_neon.c | 28 +- ..._neon.asm => vp9_idct16x16_1_add_neon.asm} | 0 ...dd_neon.asm => vp9_idct16x16_add_neon.asm} | 0 ..._neon.asm => vp9_idct32x32_1_add_neon.asm} | 0 ...dd_neon.asm => vp9_idct32x32_add_neon.asm} | 10 +- ...dd_neon.asm => vp9_idct4x4_1_add_neon.asm} | 0 ..._add_neon.asm => vp9_idct4x4_add_neon.asm} | 0 ...dd_neon.asm => vp9_idct8x8_1_add_neon.asm} | 0 ..._add_neon.asm => vp9_idct8x8_add_neon.asm} | 8 +- ...4_add_neon.asm => vp9_iht4x4_add_neon.asm} | 0 ...8_add_neon.asm => vp9_iht8x8_add_neon.asm} | 2 + .../arm/neon/vp9_loopfilter_16_neon.asm | 199 + .../common/arm/neon/vp9_loopfilter_16_neon.c | 53 + .../common/arm/neon/vp9_loopfilter_neon.asm | 70 +- .../vp9/common/arm/neon/vp9_mb_lpf_neon.asm | 33 +- .../common/arm/neon/vp9_reconintra_neon.asm | 634 +++ .../vp9/common/generic/vp9_systemdependent.c | 19 - media/libvpx/vp9/common/vp9_alloccommon.c | 298 +- media/libvpx/vp9/common/vp9_alloccommon.h | 27 +- media/libvpx/vp9/common/vp9_blockd.c | 149 + media/libvpx/vp9/common/vp9_blockd.h | 392 +- media/libvpx/vp9/common/vp9_common.h | 30 +- 
media/libvpx/vp9/common/vp9_common_data.c | 37 +- media/libvpx/vp9/common/vp9_common_data.h | 13 +- media/libvpx/vp9/common/vp9_convolve.c | 297 +- media/libvpx/vp9/common/vp9_convolve.h | 8 + media/libvpx/vp9/common/vp9_debugmodes.c | 19 +- .../vp9/common/vp9_default_coef_probs.h | 699 --- media/libvpx/vp9/common/vp9_entropy.c | 643 ++- media/libvpx/vp9/common/vp9_entropy.h | 130 +- media/libvpx/vp9/common/vp9_entropymode.c | 62 +- media/libvpx/vp9/common/vp9_entropymode.h | 71 +- media/libvpx/vp9/common/vp9_entropymv.c | 72 +- media/libvpx/vp9/common/vp9_entropymv.h | 36 +- media/libvpx/vp9/common/vp9_enums.h | 51 +- media/libvpx/vp9/common/vp9_filter.c | 18 +- media/libvpx/vp9/common/vp9_filter.h | 27 +- media/libvpx/vp9/common/vp9_findnearmv.c | 85 - media/libvpx/vp9/common/vp9_findnearmv.h | 72 - media/libvpx/vp9/common/vp9_frame_buffers.c | 82 + media/libvpx/vp9/common/vp9_frame_buffers.h | 53 + media/libvpx/vp9/common/vp9_idct.c | 126 +- media/libvpx/vp9/common/vp9_idct.h | 22 +- media/libvpx/vp9/common/vp9_loopfilter.c | 743 +-- media/libvpx/vp9/common/vp9_loopfilter.h | 58 +- .../vp9/common/vp9_loopfilter_filters.c | 128 +- media/libvpx/vp9/common/vp9_mv.h | 24 +- media/libvpx/vp9/common/vp9_mvref_common.c | 322 +- media/libvpx/vp9/common/vp9_mvref_common.h | 226 +- media/libvpx/vp9/common/vp9_onyx.h | 239 - media/libvpx/vp9/common/vp9_onyxc_int.h | 281 +- media/libvpx/vp9/common/vp9_postproc.c | 670 +-- media/libvpx/vp9/common/vp9_postproc.h | 13 +- media/libvpx/vp9/common/vp9_ppflags.h | 12 +- media/libvpx/vp9/common/vp9_pragmas.h | 22 - media/libvpx/vp9/common/vp9_pred_common.c | 348 +- media/libvpx/vp9/common/vp9_pred_common.h | 105 +- media/libvpx/vp9/common/vp9_prob.c | 61 + media/libvpx/vp9/common/vp9_prob.h | 84 + media/libvpx/vp9/common/vp9_quant_common.c | 48 +- media/libvpx/vp9/common/vp9_quant_common.h | 13 +- media/libvpx/vp9/common/vp9_reconinter.c | 404 +- media/libvpx/vp9/common/vp9_reconinter.h | 87 +- media/libvpx/vp9/common/vp9_reconintra.c | 200 +- media/libvpx/vp9/common/vp9_reconintra.h | 15 +- media/libvpx/vp9/common/vp9_sadmxn.h | 38 - media/libvpx/vp9/common/vp9_scale.c | 147 +- media/libvpx/vp9/common/vp9_scale.h | 46 +- media/libvpx/vp9/common/vp9_scan.c | 164 +- media/libvpx/vp9/common/vp9_scan.h | 184 +- media/libvpx/vp9/common/vp9_seg_common.c | 10 - media/libvpx/vp9/common/vp9_seg_common.h | 18 +- media/libvpx/vp9/common/vp9_systemdependent.h | 53 +- media/libvpx/vp9/common/vp9_thread.c | 184 + media/libvpx/vp9/common/vp9_thread.h | 219 + media/libvpx/vp9/common/vp9_tile_common.c | 10 +- media/libvpx/vp9/common/vp9_tile_common.h | 11 + media/libvpx/vp9/common/vp9_treecoder.c | 64 - media/libvpx/vp9/common/vp9_treecoder.h | 125 - media/libvpx/vp9/common/x86/vp9_asm_stubs.c | 693 ++- media/libvpx/vp9/common/x86/vp9_copy_sse2.asm | 12 +- .../vp9/common/x86/vp9_idct_intrin_sse2.c | 2492 +++++----- .../vp9/common/x86/vp9_idct_intrin_sse2.h | 175 + .../vp9/common/x86/vp9_idct_intrin_ssse3.c | 762 +++ .../vp9/common/x86/vp9_idct_ssse3_x86_64.asm | 300 ++ .../common/x86/vp9_loopfilter_intrin_avx2.c | 2 +- .../common/x86/vp9_loopfilter_intrin_sse2.c | 932 +++- .../vp9/common/x86/vp9_loopfilter_mmx.asm | 14 +- .../vp9/common/x86/vp9_postproc_mmx.asm | 534 -- .../vp9/common/x86/vp9_postproc_sse2.asm | 3 +- .../libvpx/vp9/common/x86/vp9_postproc_x86.h | 64 - .../common/x86/vp9_subpixel_8t_intrin_avx2.c | 544 +++ .../common/x86/vp9_subpixel_8t_intrin_ssse3.c | 492 ++ .../vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 9 +- .../common/x86/vp9_subpixel_bilinear_sse2.asm 
| 448 ++ .../x86/vp9_subpixel_bilinear_ssse3.asm | 422 ++ media/libvpx/vp9/decoder/vp9_dboolhuff.h | 93 - media/libvpx/vp9/decoder/vp9_decodeframe.c | 1483 ++++++ media/libvpx/vp9/decoder/vp9_decodeframe.h | 38 + media/libvpx/vp9/decoder/vp9_decodemv.c | 355 +- media/libvpx/vp9/decoder/vp9_decodemv.h | 11 +- media/libvpx/vp9/decoder/vp9_decoder.c | 380 ++ media/libvpx/vp9/decoder/vp9_decoder.h | 104 + media/libvpx/vp9/decoder/vp9_decodframe.c | 1389 ------ media/libvpx/vp9/decoder/vp9_decodframe.h | 21 - media/libvpx/vp9/decoder/vp9_detokenize.c | 226 +- media/libvpx/vp9/decoder/vp9_detokenize.h | 15 +- media/libvpx/vp9/decoder/vp9_dsubexp.c | 57 +- media/libvpx/vp9/decoder/vp9_dsubexp.h | 10 +- media/libvpx/vp9/decoder/vp9_dthread.c | 271 ++ media/libvpx/vp9/decoder/vp9_dthread.h | 58 + media/libvpx/vp9/decoder/vp9_onyxd.h | 69 - media/libvpx/vp9/decoder/vp9_onyxd_if.c | 460 -- media/libvpx/vp9/decoder/vp9_onyxd_int.h | 60 - .../libvpx/vp9/decoder/vp9_read_bit_buffer.c | 41 + .../libvpx/vp9/decoder/vp9_read_bit_buffer.h | 41 +- .../decoder/{vp9_dboolhuff.c => vp9_reader.c} | 58 +- media/libvpx/vp9/decoder/vp9_reader.h | 119 + media/libvpx/vp9/decoder/vp9_thread.c | 251 - media/libvpx/vp9/decoder/vp9_thread.h | 98 - media/libvpx/vp9/decoder/vp9_treereader.h | 30 - .../vp9/encoder/arm/neon/vp9_dct_neon.c | 223 + .../vp9/encoder/arm/neon/vp9_quantize_neon.c | 119 + .../vp9/encoder/arm/neon/vp9_sad_neon.c | 130 + .../vp9/encoder/arm/neon/vp9_subtract_neon.c | 81 + .../vp9/encoder/arm/neon/vp9_variance_neon.c | 227 + media/libvpx/vp9/encoder/vp9_aq_complexity.c | 150 + media/libvpx/vp9/encoder/vp9_aq_complexity.h | 34 + .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 323 ++ .../libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 50 + .../encoder/{vp9_vaq.c => vp9_aq_variance.c} | 73 +- .../encoder/{vp9_vaq.h => vp9_aq_variance.h} | 16 +- media/libvpx/vp9/encoder/vp9_bitstream.c | 1123 ++--- media/libvpx/vp9/encoder/vp9_bitstream.h | 24 +- media/libvpx/vp9/encoder/vp9_block.h | 193 +- media/libvpx/vp9/encoder/vp9_context_tree.c | 158 + media/libvpx/vp9/encoder/vp9_context_tree.h | 78 + .../encoder/{vp9_boolhuff.c => vp9_cost.c} | 54 +- media/libvpx/vp9/encoder/vp9_cost.h | 55 + media/libvpx/vp9/encoder/vp9_dct.c | 259 +- media/libvpx/vp9/encoder/vp9_dct.h | 24 - media/libvpx/vp9/encoder/vp9_denoiser.c | 491 ++ media/libvpx/vp9/encoder/vp9_denoiser.h | 64 + media/libvpx/vp9/encoder/vp9_encodeframe.c | 4213 ++++++++++------ media/libvpx/vp9/encoder/vp9_encodeframe.h | 18 + media/libvpx/vp9/encoder/vp9_encodeintra.c | 28 - media/libvpx/vp9/encoder/vp9_encodeintra.h | 20 - media/libvpx/vp9/encoder/vp9_encodemb.c | 777 +-- media/libvpx/vp9/encoder/vp9_encodemb.h | 48 +- media/libvpx/vp9/encoder/vp9_encodemv.c | 227 +- media/libvpx/vp9/encoder/vp9_encodemv.h | 24 +- media/libvpx/vp9/encoder/vp9_encoder.c | 3055 ++++++++++++ media/libvpx/vp9/encoder/vp9_encoder.h | 533 ++ .../vp9/{common => encoder}/vp9_extend.c | 14 +- .../vp9/{common => encoder}/vp9_extend.h | 14 +- media/libvpx/vp9/encoder/vp9_firstpass.c | 3406 ++++++------- media/libvpx/vp9/encoder/vp9_firstpass.h | 127 +- media/libvpx/vp9/encoder/vp9_lookahead.c | 77 +- media/libvpx/vp9/encoder/vp9_lookahead.h | 33 +- media/libvpx/vp9/encoder/vp9_mbgraph.c | 140 +- media/libvpx/vp9/encoder/vp9_mbgraph.h | 26 +- media/libvpx/vp9/encoder/vp9_mcomp.c | 2165 +++------ media/libvpx/vp9/encoder/vp9_mcomp.h | 171 +- media/libvpx/vp9/encoder/vp9_modecosts.c | 43 - media/libvpx/vp9/encoder/vp9_modecosts.h | 17 - media/libvpx/vp9/encoder/vp9_onyx_if.c | 4309 
----------------- media/libvpx/vp9/encoder/vp9_onyx_int.h | 709 --- media/libvpx/vp9/encoder/vp9_picklpf.c | 238 +- media/libvpx/vp9/encoder/vp9_picklpf.h | 14 +- media/libvpx/vp9/encoder/vp9_pickmode.c | 766 +++ media/libvpx/vp9/encoder/vp9_pickmode.h | 38 + media/libvpx/vp9/encoder/vp9_psnr.c | 29 - media/libvpx/vp9/encoder/vp9_psnr.h | 17 - media/libvpx/vp9/encoder/vp9_quantize.c | 447 +- media/libvpx/vp9/encoder/vp9_quantize.h | 49 +- media/libvpx/vp9/encoder/vp9_ratectrl.c | 1555 ++++-- media/libvpx/vp9/encoder/vp9_ratectrl.h | 194 +- media/libvpx/vp9/encoder/vp9_rd.c | 571 +++ media/libvpx/vp9/encoder/vp9_rd.h | 165 + media/libvpx/vp9/encoder/vp9_rdopt.c | 4195 +++++++--------- media/libvpx/vp9/encoder/vp9_rdopt.h | 47 +- media/libvpx/vp9/encoder/vp9_resize.c | 576 +++ media/libvpx/vp9/encoder/vp9_resize.h | 68 + media/libvpx/vp9/encoder/vp9_sad.c | 133 + media/libvpx/vp9/encoder/vp9_sad_c.c | 615 --- media/libvpx/vp9/encoder/vp9_segmentation.c | 106 +- media/libvpx/vp9/encoder/vp9_segmentation.h | 29 +- media/libvpx/vp9/encoder/vp9_speed_features.c | 482 ++ media/libvpx/vp9/encoder/vp9_speed_features.h | 414 ++ media/libvpx/vp9/encoder/vp9_ssim.h | 30 + media/libvpx/vp9/encoder/vp9_subexp.c | 123 +- media/libvpx/vp9/encoder/vp9_subexp.h | 18 +- .../libvpx/vp9/encoder/vp9_svc_layercontext.c | 390 ++ .../libvpx/vp9/encoder/vp9_svc_layercontext.h | 111 + .../libvpx/vp9/encoder/vp9_temporal_filter.c | 536 +- .../libvpx/vp9/encoder/vp9_temporal_filter.h | 13 +- media/libvpx/vp9/encoder/vp9_tokenize.c | 259 +- media/libvpx/vp9/encoder/vp9_tokenize.h | 23 +- media/libvpx/vp9/encoder/vp9_treewriter.c | 58 +- media/libvpx/vp9/encoder/vp9_treewriter.h | 80 +- media/libvpx/vp9/encoder/vp9_variance.c | 269 + media/libvpx/vp9/encoder/vp9_variance.h | 61 +- media/libvpx/vp9/encoder/vp9_variance_c.c | 1094 ----- .../libvpx/vp9/encoder/vp9_write_bit_buffer.c | 35 + .../libvpx/vp9/encoder/vp9_write_bit_buffer.h | 40 +- media/libvpx/vp9/encoder/vp9_writer.c | 34 + .../encoder/{vp9_boolhuff.h => vp9_writer.h} | 51 +- .../vp9/encoder/x86/vp9_dct32x32_avx2.c | 2710 +++++++++++ .../vp9/encoder/x86/vp9_dct32x32_sse2.c | 3 + media/libvpx/vp9/encoder/x86/vp9_dct_avx2.c | 26 + media/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm | 70 + media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 717 ++- .../vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm | 182 + .../vp9/encoder/x86/vp9_error_intrin_avx2.c | 72 + media/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h | 40 - ...sse3.asm => vp9_quantize_ssse3_x86_64.asm} | 188 +- .../vp9/encoder/x86/vp9_sad4d_intrin_avx2.c | 167 + media/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm | 427 -- .../vp9/encoder/x86/vp9_ssim_opt_x86_64.asm | 216 + .../vp9_subpel_variance_impl_intrin_avx2.c | 539 +++ .../x86/vp9_subpel_variance_impl_sse2.asm | 337 -- .../x86/vp9_temporal_filter_apply_sse2.asm | 51 +- .../vp9/encoder/x86/vp9_variance_avx2.c | 190 + .../x86/vp9_variance_impl_intrin_avx2.c | 213 + .../vp9/encoder/x86/vp9_variance_impl_mmx.asm | 510 -- .../encoder/x86/vp9_variance_impl_sse2.asm | 734 --- .../libvpx/vp9/encoder/x86/vp9_variance_mmx.c | 154 - .../vp9/encoder/x86/vp9_variance_sse2.c | 587 +-- media/libvpx/vp9/vp9_cx_iface.c | 1528 +++--- media/libvpx/vp9/vp9_dx_iface.c | 989 ++-- media/libvpx/vp9/vp9_iface_common.h | 32 +- media/libvpx/vp9_rtcd_armv7-android-gcc.h | 1427 +++--- media/libvpx/vp9_rtcd_generic-gnu.h | 1143 ++--- media/libvpx/vp9_rtcd_x86-darwin9-gcc.h | 1795 ++++--- media/libvpx/vp9_rtcd_x86-linux-gcc.h | 1793 ++++--- media/libvpx/vp9_rtcd_x86-win32-gcc.h | 2506 +++++----- 
media/libvpx/vp9_rtcd_x86-win32-vs8.h | 2446 +++++----- media/libvpx/vp9_rtcd_x86_64-darwin9-gcc.h | 2052 ++++---- media/libvpx/vp9_rtcd_x86_64-linux-gcc.h | 2049 ++++---- media/libvpx/vp9_rtcd_x86_64-win64-gcc.h | 2049 ++++---- media/libvpx/vp9_rtcd_x86_64-win64-vs8.h | 1974 ++++---- .../libvpx/vpx/internal/vpx_codec_internal.h | 164 +- media/libvpx/vpx/internal/vpx_psnr.h | 34 + media/libvpx/vpx/src/svc_encodeframe.c | 819 ++-- media/libvpx/vpx/src/vpx_codec.c | 64 +- media/libvpx/vpx/src/vpx_decoder.c | 81 +- media/libvpx/vpx/src/vpx_encoder.c | 77 +- media/libvpx/vpx/src/vpx_image.c | 75 +- media/libvpx/vpx/src/vpx_psnr.c | 24 + media/libvpx/vpx/svc_context.h | 37 +- media/libvpx/vpx/vp8.h | 6 +- media/libvpx/vpx/vp8cx.h | 58 +- media/libvpx/vpx/vp8dx.h | 59 +- media/libvpx/vpx/vpx_codec.h | 117 +- media/libvpx/vpx/vpx_decoder.h | 64 +- media/libvpx/vpx/vpx_encoder.h | 125 +- media/libvpx/vpx/vpx_frame_buffer.h | 80 + media/libvpx/vpx/vpx_image.h | 33 +- media/libvpx/vpx/vpx_integer.h | 19 +- media/libvpx/vpx_config_armv7-android-gcc.asm | 17 +- media/libvpx/vpx_config_armv7-android-gcc.h | 17 +- media/libvpx/vpx_config_generic-gnu.asm | 17 +- media/libvpx/vpx_config_generic-gnu.h | 17 +- media/libvpx/vpx_config_x86-darwin9-gcc.asm | 17 +- media/libvpx/vpx_config_x86-darwin9-gcc.h | 17 +- media/libvpx/vpx_config_x86-linux-gcc.asm | 17 +- media/libvpx/vpx_config_x86-linux-gcc.h | 17 +- media/libvpx/vpx_config_x86-win32-gcc.asm | 21 +- media/libvpx/vpx_config_x86-win32-gcc.h | 21 +- media/libvpx/vpx_config_x86-win32-vs8.asm | 17 +- media/libvpx/vpx_config_x86-win32-vs8.h | 17 +- .../libvpx/vpx_config_x86_64-darwin9-gcc.asm | 17 +- media/libvpx/vpx_config_x86_64-darwin9-gcc.h | 17 +- media/libvpx/vpx_config_x86_64-linux-gcc.asm | 17 +- media/libvpx/vpx_config_x86_64-linux-gcc.h | 17 +- media/libvpx/vpx_config_x86_64-win64-gcc.asm | 21 +- media/libvpx/vpx_config_x86_64-win64-gcc.h | 21 +- media/libvpx/vpx_config_x86_64-win64-vs8.asm | 17 +- media/libvpx/vpx_config_x86_64-win64-vs8.h | 17 +- media/libvpx/vpx_mem/include/vpx_mem_intrnl.h | 6 +- .../libvpx/vpx_mem/include/vpx_mem_tracker.h | 6 +- media/libvpx/vpx_mem/vpx_mem.c | 24 + media/libvpx/vpx_mem/vpx_mem.h | 9 +- media/libvpx/vpx_ports/arm.h | 12 +- media/libvpx/vpx_ports/arm_cpudetect.c | 7 + media/libvpx/vpx_ports/asm_offsets.h | 6 +- media/libvpx/vpx_ports/config.h | 6 + media/libvpx/vpx_ports/emmintrin_compat.h | 6 +- media/libvpx/vpx_ports/mem.h | 6 +- media/libvpx/vpx_ports/mem_ops.h | 18 +- media/libvpx/vpx_ports/mem_ops_aligned.h | 24 +- media/libvpx/vpx_ports/vpx_once.h | 34 +- media/libvpx/vpx_ports/vpx_timer.h | 21 +- media/libvpx/vpx_ports/x86.h | 69 +- media/libvpx/vpx_ports/x86_abi_support.asm | 11 + media/libvpx/vpx_ports/x86_cpuid.c | 49 - .../arm/neon/vp8_vpxyv12_copy_y_neon.asm | 123 - .../neon/vp8_vpxyv12_copyframe_func_neon.asm | 233 - .../vp8_vpxyv12_copysrcframe_func_neon.asm | 259 - .../vp8_vpxyv12_extendframeborders_neon.asm | 308 -- .../vpx_scale/arm/neon/yv12extend_arm.c | 21 - media/libvpx/vpx_scale/generic/yv12config.c | 142 +- media/libvpx/vpx_scale/generic/yv12extend.c | 185 +- media/libvpx/vpx_scale/vpx_scale.h | 6 +- media/libvpx/vpx_scale/yv12config.h | 106 +- .../libvpx/vpx_scale_rtcd_armv7-android-gcc.h | 62 +- media/libvpx/vpx_scale_rtcd_generic-gnu.h | 47 +- media/libvpx/vpx_scale_rtcd_x86-darwin9-gcc.h | 42 +- media/libvpx/vpx_scale_rtcd_x86-linux-gcc.h | 42 +- media/libvpx/vpx_scale_rtcd_x86-win32-gcc.h | 42 +- media/libvpx/vpx_scale_rtcd_x86-win32-vs8.h | 42 +- 
.../vpx_scale_rtcd_x86_64-darwin9-gcc.h | 42 +- .../libvpx/vpx_scale_rtcd_x86_64-linux-gcc.h | 42 +- .../libvpx/vpx_scale_rtcd_x86_64-win64-gcc.h | 42 +- .../libvpx/vpx_scale_rtcd_x86_64-win64-vs8.h | 42 +- media/libvpx/vpx_version.h | 6 +- 507 files changed, 66918 insertions(+), 59184 deletions(-) delete mode 100644 media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/copymem_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/iwalsh_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/loopfilter_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c create mode 100644 media/libvpx/vp8/common/arm/neon/reconintra_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/sad16_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/sad8_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/sad_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/save_reg_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c delete mode 100644 
media/libvpx/vp8/common/arm/neon/variance_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/variance_neon.c delete mode 100644 media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm delete mode 100644 media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm create mode 100644 media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c delete mode 100644 media/libvpx/vp8/common/arm/reconintra_arm.c delete mode 100644 media/libvpx/vp8/common/pragmas.h rename media/libvpx/vp8/common/x86/{loopfilter_block_sse2.asm => loopfilter_block_sse2_x86_64.asm} (100%) delete mode 100644 media/libvpx/vp8/common/x86/postproc_x86.c rename media/libvpx/vp8/decoder/{decodframe.c => decodeframe.c} (98%) create mode 100644 media/libvpx/vp8/encoder/arm/neon/denoising_neon.c delete mode 100644 media/libvpx/vp8/encoder/arm/neon/picklpf_arm.c delete mode 100644 media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm create mode 100644 media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c delete mode 100644 media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm create mode 100644 media/libvpx/vp8/encoder/arm/neon/subtract_neon.c delete mode 100644 media/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm delete mode 100644 media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm create mode 100644 media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c delete mode 100644 media/libvpx/vp8/encoder/psnr.c delete mode 100644 media/libvpx/vp8/encoder/psnr.h delete mode 100644 media/libvpx/vp8/encoder/x86/quantize_sse4.asm create mode 100644 media/libvpx/vp8/encoder/x86/quantize_sse4.c delete mode 100644 media/libvpx/vp8/encoder/x86/quantize_ssse3.asm create mode 100644 media/libvpx/vp8/encoder/x86/quantize_ssse3.c create mode 100644 media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct16x16_1_add_neon.asm => vp9_idct16x16_1_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct16x16_add_neon.asm => vp9_idct16x16_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct32x32_1_add_neon.asm => vp9_idct32x32_1_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct32x32_add_neon.asm => vp9_idct32x32_add_neon.asm} (99%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct4x4_1_add_neon.asm => vp9_idct4x4_1_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct4x4_add_neon.asm => vp9_idct4x4_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct8x8_1_add_neon.asm => vp9_idct8x8_1_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_idct8x8_add_neon.asm => vp9_idct8x8_add_neon.asm} (98%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_iht4x4_add_neon.asm => vp9_iht4x4_add_neon.asm} (100%) rename media/libvpx/vp9/common/arm/neon/{vp9_short_iht8x8_add_neon.asm => vp9_iht8x8_add_neon.asm} (99%) create mode 100644 media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm create mode 100644 media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c create mode 100644 media/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm delete mode 100644 media/libvpx/vp9/common/generic/vp9_systemdependent.c create mode 100644 media/libvpx/vp9/common/vp9_blockd.c delete mode 100644 media/libvpx/vp9/common/vp9_default_coef_probs.h delete mode 100644 media/libvpx/vp9/common/vp9_findnearmv.c delete mode 100644 media/libvpx/vp9/common/vp9_findnearmv.h 
create mode 100644 media/libvpx/vp9/common/vp9_frame_buffers.c create mode 100644 media/libvpx/vp9/common/vp9_frame_buffers.h delete mode 100644 media/libvpx/vp9/common/vp9_onyx.h delete mode 100644 media/libvpx/vp9/common/vp9_pragmas.h create mode 100644 media/libvpx/vp9/common/vp9_prob.c create mode 100644 media/libvpx/vp9/common/vp9_prob.h delete mode 100644 media/libvpx/vp9/common/vp9_sadmxn.h create mode 100644 media/libvpx/vp9/common/vp9_thread.c create mode 100644 media/libvpx/vp9/common/vp9_thread.h delete mode 100644 media/libvpx/vp9/common/vp9_treecoder.c delete mode 100644 media/libvpx/vp9/common/vp9_treecoder.h create mode 100644 media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h create mode 100644 media/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c create mode 100644 media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm delete mode 100644 media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm delete mode 100644 media/libvpx/vp9/common/x86/vp9_postproc_x86.h create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm create mode 100644 media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm delete mode 100644 media/libvpx/vp9/decoder/vp9_dboolhuff.h create mode 100644 media/libvpx/vp9/decoder/vp9_decodeframe.c create mode 100644 media/libvpx/vp9/decoder/vp9_decodeframe.h create mode 100644 media/libvpx/vp9/decoder/vp9_decoder.c create mode 100644 media/libvpx/vp9/decoder/vp9_decoder.h delete mode 100644 media/libvpx/vp9/decoder/vp9_decodframe.c delete mode 100644 media/libvpx/vp9/decoder/vp9_decodframe.h create mode 100644 media/libvpx/vp9/decoder/vp9_dthread.c create mode 100644 media/libvpx/vp9/decoder/vp9_dthread.h delete mode 100644 media/libvpx/vp9/decoder/vp9_onyxd.h delete mode 100644 media/libvpx/vp9/decoder/vp9_onyxd_if.c delete mode 100644 media/libvpx/vp9/decoder/vp9_onyxd_int.h create mode 100644 media/libvpx/vp9/decoder/vp9_read_bit_buffer.c rename media/libvpx/vp9/decoder/{vp9_dboolhuff.c => vp9_reader.c} (62%) create mode 100644 media/libvpx/vp9/decoder/vp9_reader.h delete mode 100644 media/libvpx/vp9/decoder/vp9_thread.c delete mode 100644 media/libvpx/vp9/decoder/vp9_thread.h delete mode 100644 media/libvpx/vp9/decoder/vp9_treereader.h create mode 100644 media/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c create mode 100644 media/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c create mode 100644 media/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c create mode 100644 media/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c create mode 100644 media/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c create mode 100644 media/libvpx/vp9/encoder/vp9_aq_complexity.c create mode 100644 media/libvpx/vp9/encoder/vp9_aq_complexity.h create mode 100644 media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c create mode 100644 media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h rename media/libvpx/vp9/encoder/{vp9_vaq.c => vp9_aq_variance.c} (66%) rename media/libvpx/vp9/encoder/{vp9_vaq.h => vp9_aq_variance.h} (73%) create mode 100644 media/libvpx/vp9/encoder/vp9_context_tree.c create mode 100644 media/libvpx/vp9/encoder/vp9_context_tree.h rename media/libvpx/vp9/encoder/{vp9_boolhuff.c => vp9_cost.c} (71%) create mode 100644 media/libvpx/vp9/encoder/vp9_cost.h delete mode 100644 media/libvpx/vp9/encoder/vp9_dct.h create mode 100644 media/libvpx/vp9/encoder/vp9_denoiser.c create mode 100644 media/libvpx/vp9/encoder/vp9_denoiser.h 
delete mode 100644 media/libvpx/vp9/encoder/vp9_encodeintra.c delete mode 100644 media/libvpx/vp9/encoder/vp9_encodeintra.h create mode 100644 media/libvpx/vp9/encoder/vp9_encoder.c create mode 100644 media/libvpx/vp9/encoder/vp9_encoder.h rename media/libvpx/vp9/{common => encoder}/vp9_extend.c (90%) rename media/libvpx/vp9/{common => encoder}/vp9_extend.h (81%) delete mode 100644 media/libvpx/vp9/encoder/vp9_modecosts.c delete mode 100644 media/libvpx/vp9/encoder/vp9_modecosts.h delete mode 100644 media/libvpx/vp9/encoder/vp9_onyx_if.c delete mode 100644 media/libvpx/vp9/encoder/vp9_onyx_int.h create mode 100644 media/libvpx/vp9/encoder/vp9_pickmode.c create mode 100644 media/libvpx/vp9/encoder/vp9_pickmode.h delete mode 100644 media/libvpx/vp9/encoder/vp9_psnr.c delete mode 100644 media/libvpx/vp9/encoder/vp9_psnr.h create mode 100644 media/libvpx/vp9/encoder/vp9_rd.c create mode 100644 media/libvpx/vp9/encoder/vp9_rd.h create mode 100644 media/libvpx/vp9/encoder/vp9_resize.c create mode 100644 media/libvpx/vp9/encoder/vp9_resize.h create mode 100644 media/libvpx/vp9/encoder/vp9_sad.c delete mode 100644 media/libvpx/vp9/encoder/vp9_sad_c.c create mode 100644 media/libvpx/vp9/encoder/vp9_speed_features.c create mode 100644 media/libvpx/vp9/encoder/vp9_speed_features.h create mode 100644 media/libvpx/vp9/encoder/vp9_ssim.h create mode 100644 media/libvpx/vp9/encoder/vp9_svc_layercontext.c create mode 100644 media/libvpx/vp9/encoder/vp9_svc_layercontext.h create mode 100644 media/libvpx/vp9/encoder/vp9_variance.c delete mode 100644 media/libvpx/vp9/encoder/vp9_variance_c.c create mode 100644 media/libvpx/vp9/encoder/vp9_write_bit_buffer.c create mode 100644 media/libvpx/vp9/encoder/vp9_writer.c rename media/libvpx/vp9/encoder/{vp9_boolhuff.h => vp9_writer.h} (62%) create mode 100644 media/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c create mode 100644 media/libvpx/vp9/encoder/x86/vp9_dct_avx2.c create mode 100644 media/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm create mode 100644 media/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm create mode 100644 media/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c delete mode 100644 media/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h rename media/libvpx/vp9/encoder/x86/{vp9_quantize_ssse3.asm => vp9_quantize_ssse3_x86_64.asm} (54%) create mode 100644 media/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c delete mode 100644 media/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm create mode 100644 media/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm create mode 100644 media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c delete mode 100644 media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm create mode 100644 media/libvpx/vp9/encoder/x86/vp9_variance_avx2.c create mode 100644 media/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c delete mode 100644 media/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm delete mode 100644 media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm delete mode 100644 media/libvpx/vp9/encoder/x86/vp9_variance_mmx.c create mode 100644 media/libvpx/vpx/internal/vpx_psnr.h create mode 100644 media/libvpx/vpx/src/vpx_psnr.c create mode 100644 media/libvpx/vpx/vpx_frame_buffer.h delete mode 100644 media/libvpx/vpx_ports/x86_cpuid.c delete mode 100644 media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm delete mode 100644 media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm delete mode 100644 media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm delete mode 100644 
media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
 delete mode 100644 media/libvpx/vpx_scale/arm/neon/yv12extend_arm.c

diff --git a/media/libvpx/PATENTS b/media/libvpx/PATENTS
index 4414d8385053..79d17d7d6a92 100644
--- a/media/libvpx/PATENTS
+++ b/media/libvpx/PATENTS
@@ -1,22 +1,23 @@
 Additional IP Rights Grant (Patents)
+------------------------------------
 
-"This implementation" means the copyrightable works distributed by
-Google as part of the WebM Project.
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
 
-Google hereby grants to you a perpetual, worldwide, non-exclusive,
-no-charge, royalty-free, irrevocable (except as stated in this section)
-patent license to make, have made, use, offer to sell, sell, import,
-transfer, and otherwise run, modify and propagate the contents of this
-implementation of VP8, where such license applies only to those patent
-claims, both currently owned by Google and acquired in the future,
-licensable by Google that are necessarily infringed by this
-implementation of VP8. This grant does not include claims that would be
-infringed only as a consequence of further modification of this
-implementation. If you or your agent or exclusive licensee institute or
-order or agree to the institution of patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
-that this implementation of VP8 or any code incorporated within this
-implementation of VP8 constitutes direct or contributory patent
-infringement, or inducement of patent infringement, then any patent
-rights granted to you under this License for this implementation of VP8
-shall terminate as of the date such litigation is filed.
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitutes direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/media/libvpx/README_MOZILLA b/media/libvpx/README_MOZILLA
index 4a3574b8c909..d38c4334ff1c 100644
--- a/media/libvpx/README_MOZILLA
+++ b/media/libvpx/README_MOZILLA
@@ -8,4 +8,4 @@ The libvpx git repository is:
 
   https://gerrit.chromium.org/gerrit/webm/libvpx
 
-The git commit ID used was 2e88f2f2ec777259bda1714e72f1ecd2519bceb5
+The git commit ID used was c731d6a4f19eea861ceb2ff31399420b2452eb74

diff --git a/media/libvpx/build/make/ads2gas.pl b/media/libvpx/build/make/ads2gas.pl
index 9c41901295f5..7272424af2eb 100755
--- a/media/libvpx/build/make/ads2gas.pl
+++ b/media/libvpx/build/make/ads2gas.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 ##
 ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 ##

diff --git a/media/libvpx/build/make/obj_int_extract.c b/media/libvpx/build/make/obj_int_extract.c
index feed9d983049..2e50f387fa5f 100644
--- a/media/libvpx/build/make/obj_int_extract.c
+++ b/media/libvpx/build/make/obj_int_extract.c
@@ -21,6 +21,7 @@ typedef enum {
   OUTPUT_FMT_PLAIN,
   OUTPUT_FMT_RVDS,
   OUTPUT_FMT_GAS,
+  OUTPUT_FMT_C_HEADER,
 } output_fmt_t;
 
 int log_msg(const char *fmt, ...) {
@@ -33,6 +34,18 @@
 }
 
 #if defined(__GNUC__) && __GNUC__
+
+#if defined(FORCE_PARSE_ELF)
+
+#if defined(__MACH__)
+#undef __MACH__
+#endif
+
+#if !defined(__ELF__)
+#define __ELF__
+#endif
+#endif
+
 #if defined(__MACH__)
 
 #include <mach-o/loader.h>
@@ -43,9 +56,12 @@ int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
     case OUTPUT_FMT_RVDS:
       printf("%-40s EQU %5d\n", name, val);
       return 0;
-    case OUTPUT_FMT_GAS:
+    case OUTPUT_FMT_GAS:
       printf(".set %-40s, %5d\n", name, val);
       return 0;
+    case OUTPUT_FMT_C_HEADER:
+      printf("#define %-40s %5d\n", name, val);
+      return 0;
     default:
       log_msg("Unsupported mode: %d", mode);
       return 1;
@@ -321,7 +337,7 @@
 bail:
   return 1;
 }
 
-char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
+const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
   if (elf->bits == 32) {
     Elf32_Shdr shdr;
@@ -491,6 +507,13 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
                                                 sym.st_name),
                  val);
           break;
+        case OUTPUT_FMT_C_HEADER:
+          printf("#define %-40s %5d\n",
+                 parse_elf_string_table(&elf,
+                                        shdr.sh_link,
+                                        sym.st_name),
+                 val);
+          break;
         default:
           printf("%s = %d\n",
                  parse_elf_string_table(&elf,
@@ -655,7 +678,11 @@ int parse_coff(uint8_t *buf, size_t sz) {
     }
     strcpy(sectionlist[i], sectionname);
 
-    if (!strcmp(sectionname, ".rdata")) sectionrawdata_ptr = get_le32(ptr + 20);
+    // check if it's .rdata and is not a COMDAT section.
+    if (!strcmp(sectionname, ".rdata") &&
+        (get_le32(ptr + 36) & 0x1000) == 0) {
+      sectionrawdata_ptr = get_le32(ptr + 20);
+    }
 
     ptr += 40;
   }
@@ -762,6 +789,7 @@ int main(int argc, char **argv) {
     fprintf(stderr, "Output Formats:\n");
     fprintf(stderr, "  gas  - compatible with GNU assembler\n");
     fprintf(stderr, "  rvds - compatible with armasm\n");
+    fprintf(stderr, "  cheader - c/c++ header file\n");
     goto bail;
   }
@@ -771,6 +799,8 @@
     mode = OUTPUT_FMT_RVDS;
   else if (!strcmp(argv[1], "gas"))
     mode = OUTPUT_FMT_GAS;
+  else if (!strcmp(argv[1], "cheader"))
+    mode = OUTPUT_FMT_C_HEADER;
   else
     f = argv[1];

diff --git a/media/libvpx/build/make/thumb.pm b/media/libvpx/build/make/thumb.pm
index e1f34c1ec48c..483c2539c68a 100644
--- a/media/libvpx/build/make/thumb.pm
+++ b/media/libvpx/build/make/thumb.pm
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 ##
 ##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 ##
 
@@ -24,7 +24,7 @@ sub FixThumbInstructions($$)
     # with left shift, addition and a right shift (to restore the
     # register to the original value). Currently the right shift
     # isn't necessary in the code base since the values in these
-    # registers aren't used, but doing the shift for consitency.
+    # registers aren't used, but doing the shift for consistency.
     # This converts instructions such as "add r12, r12, r5, lsl r4"
     # into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4".
     s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g;
@@ -51,7 +51,7 @@ sub FixThumbInstructions($$)
 
     # Convert register post indexing to a separate add instruction.
     # This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
-    # "add r0, r2".
+    # "addne r0, r0, r2".
     s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
 
     # Convert a conditional addition to the pc register into a series of

diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild
index 8ef7c8ca7039..d30350a20596 100644
--- a/media/libvpx/sources.mozbuild
+++ b/media/libvpx/sources.mozbuild
@@ -1,244 +1,105 @@
 files = {
-    'ARM_ASM': [
-        'vp8/common/arm/armv6/bilinearfilter_v6.asm',
-        'vp8/common/arm/armv6/copymem16x16_v6.asm',
-        'vp8/common/arm/armv6/copymem8x4_v6.asm',
-        'vp8/common/arm/armv6/copymem8x8_v6.asm',
-        'vp8/common/arm/armv6/dc_only_idct_add_v6.asm',
-        'vp8/common/arm/armv6/dequant_idct_v6.asm',
-        'vp8/common/arm/armv6/dequantize_v6.asm',
-        'vp8/common/arm/armv6/filter_v6.asm',
-        'vp8/common/arm/armv6/idct_blk_v6.c',
-        'vp8/common/arm/armv6/idct_v6.asm',
-        'vp8/common/arm/armv6/intra4x4_predict_v6.asm',
-        'vp8/common/arm/armv6/iwalsh_v6.asm',
-        'vp8/common/arm/armv6/loopfilter_v6.asm',
-        'vp8/common/arm/armv6/simpleloopfilter_v6.asm',
-        'vp8/common/arm/armv6/sixtappredict8x4_v6.asm',
-        'vp8/common/arm/armv6/vp8_sad16x16_armv6.asm',
-        'vp8/common/arm/armv6/vp8_variance16x16_armv6.asm',
-        'vp8/common/arm/armv6/vp8_variance8x8_armv6.asm',
-        'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm',
-        'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm',
-        'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm',
-        'vp8/common/arm/bilinearfilter_arm.c',
-        'vp8/common/arm/dequantize_arm.c',
-        'vp8/common/arm/filter_arm.c',
-        'vp8/common/arm/loopfilter_arm.c',
-        'vp8/common/arm/neon/bilinearpredict16x16_neon.asm',
-        'vp8/common/arm/neon/bilinearpredict4x4_neon.asm',
-        'vp8/common/arm/neon/bilinearpredict8x4_neon.asm',
-        'vp8/common/arm/neon/bilinearpredict8x8_neon.asm',
-        'vp8/common/arm/neon/buildintrapredictorsmby_neon.asm',
-        'vp8/common/arm/neon/copymem16x16_neon.asm',
-        'vp8/common/arm/neon/copymem8x4_neon.asm',
-        'vp8/common/arm/neon/copymem8x8_neon.asm',
-        'vp8/common/arm/neon/dc_only_idct_add_neon.asm',
-        'vp8/common/arm/neon/dequant_idct_neon.asm',
-        'vp8/common/arm/neon/dequantizeb_neon.asm',
-        'vp8/common/arm/neon/idct_blk_neon.c',
-        'vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
-        'vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
-        'vp8/common/arm/neon/iwalsh_neon.asm',
-        'vp8/common/arm/neon/loopfilter_neon.asm',
-        'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm',
-        'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
-        'vp8/common/arm/neon/mbloopfilter_neon.asm',
-        'vp8/common/arm/neon/sad16_neon.asm',
-        'vp8/common/arm/neon/sad8_neon.asm',
-        'vp8/common/arm/neon/save_reg_neon.asm',
-        'vp8/common/arm/neon/shortidct4x4llm_neon.asm',
-        'vp8/common/arm/neon/sixtappredict16x16_neon.asm',
-        'vp8/common/arm/neon/sixtappredict4x4_neon.asm',
-        'vp8/common/arm/neon/sixtappredict8x4_neon.asm',
-        'vp8/common/arm/neon/sixtappredict8x8_neon.asm',
-        'vp8/common/arm/neon/variance_neon.asm',
-        'vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
-        'vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
-        'vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
-        'vp8/common/arm/reconintra_arm.c',
-        'vp8/common/arm/variance_arm.c',
-        'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
-        'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
-        'vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm',
-        'vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm',
-        'vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm',
-        'vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm',
-        'vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm',
-        'vp8/encoder/arm/armv6/vp8_subtract_armv6.asm',
-        'vp8/encoder/arm/armv6/walsh_v6.asm',
-        'vp8/encoder/arm/boolhuff_arm.c',
-        'vp8/encoder/arm/dct_arm.c',
-        'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
-        'vp8/encoder/arm/neon/picklpf_arm.c',
-        'vp8/encoder/arm/neon/shortfdct_neon.asm',
-        'vp8/encoder/arm/neon/subtract_neon.asm',
-        'vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
-        'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
-        'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm',
-        'vp8/encoder/arm/quantize_arm.c',
-        'vp9/common/arm/neon/vp9_avg_neon.asm',
-        'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
-        'vp9/common/arm/neon/vp9_convolve8_neon.asm',
-        'vp9/common/arm/neon/vp9_convolve_neon.c',
-        'vp9/common/arm/neon/vp9_copy_neon.asm',
-        'vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm',
-        'vp9/common/arm/neon/vp9_idct16x16_neon.c',
-        'vp9/common/arm/neon/vp9_loopfilter_neon.asm',
-        'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
-        'vp9/common/arm/neon/vp9_save_reg_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm',
-        'vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm',
-        'vpx_ports/arm_cpudetect.c',
-        'vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm',
-        'vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm',
-        'vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm',
-        'vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm',
-        'vpx_scale/arm/neon/yv12extend_arm.c'
-    ],
-    'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c'],
-    'UNIFIED_SOURCES': [
-        'vp8/common/alloccommon.c',
-        'vp8/common/blockd.c',
-        'vp8/common/debugmodes.c',
-        'vp8/common/dequantize.c',
-        'vp8/common/entropy.c',
-        'vp8/common/entropymode.c',
-        'vp8/common/entropymv.c',
-        'vp8/common/extend.c',
-        'vp8/common/filter.c',
-        'vp8/common/findnearmv.c',
-        'vp8/common/generic/systemdependent.c',
-        'vp8/common/idct_blk.c',
-        'vp8/common/idctllm.c',
-        'vp8/common/loopfilter.c',
-        'vp8/common/loopfilter_filters.c',
-        'vp8/common/mbpitch.c',
-        'vp8/common/modecont.c',
-        'vp8/common/quant_common.c',
-        'vp8/common/reconinter.c',
-        'vp8/common/reconintra.c',
-        'vp8/common/reconintra4x4.c',
-        'vp8/common/setupintrarecon.c',
-        'vp8/common/swapyv12buffer.c',
-        'vp8/common/treecoder.c',
-        'vp8/common/variance_c.c',
-        'vp8/decoder/dboolhuff.c',
-        'vp8/decoder/decodemv.c',
-        'vp8/decoder/decodframe.c',
-        'vp8/decoder/detokenize.c',
-        'vp8/decoder/onyxd_if.c',
-        'vp8/decoder/threading.c',
-        'vp8/encoder/bitstream.c',
-        'vp8/encoder/dct.c',
-        'vp8/encoder/denoising.c',
-        'vp8/encoder/encodeframe.c',
-        'vp8/encoder/encodeintra.c',
-        'vp8/encoder/encodemb.c',
-        'vp8/encoder/encodemv.c',
-        'vp8/encoder/ethreading.c',
-        'vp8/encoder/firstpass.c',
-        'vp8/encoder/lookahead.c',
-        'vp8/encoder/mcomp.c',
-        'vp8/encoder/modecosts.c',
-        'vp8/encoder/mr_dissim.c',
-        'vp8/encoder/onyx_if.c',
-        'vp8/encoder/pickinter.c',
-        'vp8/encoder/picklpf.c',
-        'vp8/encoder/psnr.c',
-        'vp8/encoder/quantize.c',
-        'vp8/encoder/ratectrl.c',
-        'vp8/encoder/rdopt.c',
-        'vp8/encoder/segmentation.c',
-        'vp8/encoder/temporal_filter.c',
-        'vp8/encoder/tokenize.c',
-        'vp8/encoder/treewriter.c',
-        'vp8/vp8_cx_iface.c',
-        'vp9/common/generic/vp9_systemdependent.c',
-        'vp9/common/vp9_alloccommon.c',
-        'vp9/common/vp9_common_data.c',
-        'vp9/common/vp9_convolve.c',
-        'vp9/common/vp9_debugmodes.c',
-        'vp9/common/vp9_entropy.c',
-        'vp9/common/vp9_entropymode.c',
-        'vp9/common/vp9_extend.c',
-        'vp9/common/vp9_filter.c',
-        'vp9/common/vp9_findnearmv.c',
-        'vp9/common/vp9_idct.c',
-        'vp9/common/vp9_loopfilter.c',
-        'vp9/common/vp9_loopfilter_filters.c',
-        'vp9/common/vp9_mvref_common.c',
-        'vp9/common/vp9_pred_common.c',
-        'vp9/common/vp9_quant_common.c',
-        'vp9/common/vp9_reconinter.c',
-        'vp9/common/vp9_reconintra.c',
-        'vp9/common/vp9_scale.c',
-        'vp9/common/vp9_scan.c',
-        'vp9/common/vp9_seg_common.c',
-        'vp9/common/vp9_tile_common.c',
-        'vp9/common/vp9_treecoder.c',
-        'vp9/decoder/vp9_dboolhuff.c',
-        'vp9/decoder/vp9_decodemv.c',
-        'vp9/decoder/vp9_decodframe.c',
-        'vp9/decoder/vp9_detokenize.c',
-        'vp9/decoder/vp9_dsubexp.c',
-        'vp9/decoder/vp9_onyxd_if.c',
-        'vp9/decoder/vp9_thread.c',
-        'vp9/encoder/vp9_boolhuff.c',
-        'vp9/encoder/vp9_dct.c',
-        'vp9/encoder/vp9_encodeframe.c',
-        'vp9/encoder/vp9_encodeintra.c',
-        'vp9/encoder/vp9_encodemb.c',
-        'vp9/encoder/vp9_encodemv.c',
-        'vp9/encoder/vp9_firstpass.c',
-        'vp9/encoder/vp9_lookahead.c',
-        'vp9/encoder/vp9_mbgraph.c',
-        'vp9/encoder/vp9_mcomp.c',
-        'vp9/encoder/vp9_modecosts.c',
-        'vp9/encoder/vp9_onyx_if.c',
-        'vp9/encoder/vp9_picklpf.c',
-        'vp9/encoder/vp9_psnr.c',
-        'vp9/encoder/vp9_quantize.c',
-        'vp9/encoder/vp9_ratectrl.c',
-        'vp9/encoder/vp9_rdopt.c',
-        'vp9/encoder/vp9_sad_c.c',
-        'vp9/encoder/vp9_segmentation.c',
-        'vp9/encoder/vp9_subexp.c',
-        'vp9/encoder/vp9_temporal_filter.c',
-        'vp9/encoder/vp9_tokenize.c',
-        'vp9/encoder/vp9_treewriter.c',
-        'vp9/encoder/vp9_vaq.c',
-        'vp9/encoder/vp9_variance_c.c',
-        'vp9/vp9_cx_iface.c',
-        'vp9/vp9_dx_iface.c',
-        'vpx/src/vpx_codec.c',
-        'vpx/src/vpx_decoder.c',
-        'vpx/src/vpx_encoder.c',
-        'vpx/src/vpx_image.c',
-        'vpx_scale/generic/gen_scalers.c',
-        'vpx_scale/generic/vpx_scale.c',
-        'vpx_scale/generic/yv12config.c',
-        'vpx_scale/generic/yv12extend.c',
-        'vpx_scale/vpx_scale_rtcd.c'
-    ],
-    'SOURCES': [
-        'vp8/common/rtcd.c',
-        'vp8/common/sad_c.c',
-        'vp8/vp8_dx_iface.c',
-        'vp9/common/vp9_entropymv.c',
-        'vp9/common/vp9_rtcd.c',
-        'vp9/encoder/vp9_bitstream.c',
-        'vpx/src/svc_encodeframe.c',
-        'vpx_mem/vpx_mem.c',
-    ],
+    'ARM_ASM': ['vp8/common/arm/armv6/bilinearfilter_v6.asm',
+                'vp8/common/arm/armv6/copymem16x16_v6.asm',
+                'vp8/common/arm/armv6/copymem8x4_v6.asm',
+                'vp8/common/arm/armv6/copymem8x8_v6.asm',
+                'vp8/common/arm/armv6/dc_only_idct_add_v6.asm',
+                'vp8/common/arm/armv6/dequant_idct_v6.asm',
+                'vp8/common/arm/armv6/dequantize_v6.asm',
+                'vp8/common/arm/armv6/filter_v6.asm',
+                'vp8/common/arm/armv6/idct_blk_v6.c',
+                'vp8/common/arm/armv6/idct_v6.asm',
+                'vp8/common/arm/armv6/intra4x4_predict_v6.asm',
+                'vp8/common/arm/armv6/iwalsh_v6.asm',
+                'vp8/common/arm/armv6/loopfilter_v6.asm',
+                'vp8/common/arm/armv6/simpleloopfilter_v6.asm',
+                'vp8/common/arm/armv6/sixtappredict8x4_v6.asm',
+                'vp8/common/arm/armv6/vp8_sad16x16_armv6.asm',
+                'vp8/common/arm/armv6/vp8_variance16x16_armv6.asm',
+                'vp8/common/arm/armv6/vp8_variance8x8_armv6.asm',
+                'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm',
+                'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm',
+                'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm',
+                'vp8/common/arm/bilinearfilter_arm.c',
+                'vp8/common/arm/dequantize_arm.c',
+                'vp8/common/arm/filter_arm.c',
+                'vp8/common/arm/loopfilter_arm.c',
+                'vp8/common/arm/neon/bilinearpredict_neon.c',
+                'vp8/common/arm/neon/copymem_neon.c',
+                'vp8/common/arm/neon/dc_only_idct_add_neon.c',
+                'vp8/common/arm/neon/dequant_idct_neon.c',
+                'vp8/common/arm/neon/dequantizeb_neon.c',
+                'vp8/common/arm/neon/idct_blk_neon.c',
+                'vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
+                'vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
+                'vp8/common/arm/neon/iwalsh_neon.c',
+                'vp8/common/arm/neon/loopfilter_neon.c',
+                'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
+                'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
+                'vp8/common/arm/neon/mbloopfilter_neon.c',
+                'vp8/common/arm/neon/reconintra_neon.c',
+                'vp8/common/arm/neon/sad_neon.c',
+                'vp8/common/arm/neon/shortidct4x4llm_neon.c',
+                'vp8/common/arm/neon/sixtappredict_neon.c',
+                'vp8/common/arm/neon/variance_neon.c',
+                'vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
+                'vp8/common/arm/variance_arm.c',
+                'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
+                'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
+                'vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm',
+                'vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm',
+                'vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm',
+                'vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm',
+                'vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm',
+                'vp8/encoder/arm/armv6/vp8_subtract_armv6.asm',
+                'vp8/encoder/arm/armv6/walsh_v6.asm',
+                'vp8/encoder/arm/boolhuff_arm.c',
+                'vp8/encoder/arm/dct_arm.c',
+                'vp8/encoder/arm/neon/denoising_neon.c',
+                'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
+                'vp8/encoder/arm/neon/shortfdct_neon.c',
+                'vp8/encoder/arm/neon/subtract_neon.c',
+                'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
+                'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
+                'vp8/encoder/arm/quantize_arm.c',
+                'vp9/common/arm/neon/vp9_avg_neon.asm',
+                'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
+                'vp9/common/arm/neon/vp9_convolve8_neon.asm',
+                'vp9/common/arm/neon/vp9_convolve_neon.c',
+                'vp9/common/arm/neon/vp9_copy_neon.asm',
+                'vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct16x16_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct16x16_neon.c',
+                'vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct32x32_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct4x4_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm',
+                'vp9/common/arm/neon/vp9_idct8x8_add_neon.asm',
+                'vp9/common/arm/neon/vp9_iht4x4_add_neon.asm',
+                'vp9/common/arm/neon/vp9_iht8x8_add_neon.asm',
+                'vp9/common/arm/neon/vp9_loopfilter_16_neon.asm',
+                'vp9/common/arm/neon/vp9_loopfilter_16_neon.c',
+                'vp9/common/arm/neon/vp9_loopfilter_neon.asm',
+                'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
+                'vp9/common/arm/neon/vp9_reconintra_neon.asm',
+                'vp9/common/arm/neon/vp9_save_reg_neon.asm',
+                'vp9/encoder/arm/neon/vp9_dct_neon.c',
+                'vp9/encoder/arm/neon/vp9_quantize_neon.c',
+                'vp9/encoder/arm/neon/vp9_sad_neon.c',
+                'vp9/encoder/arm/neon/vp9_subtract_neon.c',
+                'vp9/encoder/arm/neon/vp9_variance_neon.c',
+                'vpx_ports/arm_cpudetect.c'],
+    'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c',
+             'vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c',
+             'vp9/encoder/x86/vp9_dct32x32_avx2.c',
+             'vp9/encoder/x86/vp9_dct_avx2.c',
+             'vp9/encoder/x86/vp9_error_intrin_avx2.c',
+             'vp9/encoder/x86/vp9_sad4d_intrin_avx2.c',
+             'vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c',
+             'vp9/encoder/x86/vp9_variance_avx2.c',
+             'vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c'],
     'ERROR_CONCEALMENT': ['vp8/decoder/error_concealment.c'],
     'EXPORTS': ['vpx/vp8.h',
                 'vpx/vp8cx.h',
@@ -246,6 +107,7 @@ files = {
                 'vpx/vpx_codec.h',
                 'vpx/vpx_decoder.h',
                 'vpx/vpx_encoder.h',
+                'vpx/vpx_frame_buffer.h',
                 'vpx/vpx_image.h',
                 'vpx/vpx_integer.h',
                 'vpx_mem/include/vpx_mem_intrnl.h',
@@ -256,86 +118,225 @@ files = {
                 'vpx_ports/x86.h',
                 'vpx_scale/vpx_scale.h',
                 'vpx_scale/yv12config.h'],
+    'SOURCES': ['vp8/common/rtcd.c',
+                'vp8/common/sad_c.c',
+                'vp8/encoder/bitstream.c',
+                'vp8/encoder/onyx_if.c',
+                'vp8/vp8_dx_iface.c',
+                'vp9/common/vp9_alloccommon.c',
+                'vp9/common/vp9_blockd.c',
+                'vp9/common/vp9_common_data.c',
+                'vp9/common/vp9_convolve.c',
+                'vp9/common/vp9_debugmodes.c',
+                'vp9/common/vp9_entropy.c',
+                'vp9/common/vp9_entropymode.c',
+                'vp9/common/vp9_entropymv.c',
+                'vp9/common/vp9_filter.c',
+                'vp9/common/vp9_frame_buffers.c',
+                'vp9/common/vp9_idct.c',
+                'vp9/common/vp9_loopfilter.c',
+                'vp9/common/vp9_loopfilter_filters.c',
+                'vp9/common/vp9_mvref_common.c',
+                'vp9/common/vp9_pred_common.c',
+                'vp9/common/vp9_prob.c',
+                'vp9/common/vp9_quant_common.c',
+                'vp9/common/vp9_reconinter.c',
+                'vp9/common/vp9_reconintra.c',
+                'vp9/common/vp9_rtcd.c',
+                'vp9/common/vp9_scale.c',
+                'vp9/common/vp9_scan.c',
+                'vp9/common/vp9_seg_common.c',
+                'vp9/common/vp9_thread.c',
+                'vp9/common/vp9_tile_common.c',
+                'vp9/decoder/vp9_decodeframe.c',
+                'vp9/decoder/vp9_decodemv.c',
+                'vp9/decoder/vp9_decoder.c',
+                'vp9/decoder/vp9_detokenize.c',
+                'vp9/decoder/vp9_dsubexp.c',
+                'vp9/decoder/vp9_dthread.c',
+                'vp9/decoder/vp9_reader.c',
+                'vp9/encoder/vp9_aq_complexity.c',
+                'vp9/encoder/vp9_aq_cyclicrefresh.c',
+                'vp9/encoder/vp9_aq_variance.c',
+                'vp9/encoder/vp9_bitstream.c',
+                'vp9/encoder/vp9_context_tree.c',
+                'vp9/encoder/vp9_cost.c',
+                'vp9/encoder/vp9_dct.c',
+                'vp9/encoder/vp9_encodeframe.c',
+                'vp9/encoder/vp9_encodemb.c',
+                'vp9/encoder/vp9_encodemv.c',
+                'vp9/encoder/vp9_encoder.c',
+                'vp9/encoder/vp9_extend.c',
+                'vp9/encoder/vp9_firstpass.c',
+                'vp9/encoder/vp9_lookahead.c',
+                'vp9/encoder/vp9_mbgraph.c',
+                'vp9/encoder/vp9_mcomp.c',
+                'vp9/encoder/vp9_picklpf.c',
+                'vp9/encoder/vp9_pickmode.c',
+                'vp9/encoder/vp9_quantize.c',
+                'vp9/encoder/vp9_ratectrl.c',
+                'vp9/encoder/vp9_rd.c',
+                'vp9/encoder/vp9_rdopt.c',
+                'vp9/encoder/vp9_resize.c',
+                'vp9/encoder/vp9_sad.c',
+                'vp9/encoder/vp9_segmentation.c',
+                'vp9/encoder/vp9_speed_features.c',
+                'vp9/encoder/vp9_subexp.c',
+                'vp9/encoder/vp9_svc_layercontext.c',
+                'vp9/encoder/vp9_temporal_filter.c',
+                'vp9/encoder/vp9_tokenize.c',
+ 'vp9/encoder/vp9_treewriter.c', + 'vp9/encoder/vp9_variance.c', + 'vp9/encoder/vp9_write_bit_buffer.c', + 'vp9/encoder/vp9_writer.c', + 'vp9/vp9_cx_iface.c', + 'vp9/vp9_dx_iface.c', + 'vpx/src/svc_encodeframe.c', + 'vpx/src/vpx_encoder.c', + 'vpx_mem/vpx_mem.c', + 'vpx_scale/generic/yv12config.c', + 'vpx_scale/generic/yv12extend.c', + 'vpx_scale/vpx_scale_rtcd.c'], + 'UNIFIED_SOURCES': ['vp8/common/alloccommon.c', + 'vp8/common/blockd.c', + 'vp8/common/debugmodes.c', + 'vp8/common/dequantize.c', + 'vp8/common/entropy.c', + 'vp8/common/entropymode.c', + 'vp8/common/entropymv.c', + 'vp8/common/extend.c', + 'vp8/common/filter.c', + 'vp8/common/findnearmv.c', + 'vp8/common/generic/systemdependent.c', + 'vp8/common/idct_blk.c', + 'vp8/common/idctllm.c', + 'vp8/common/loopfilter.c', + 'vp8/common/loopfilter_filters.c', + 'vp8/common/mbpitch.c', + 'vp8/common/modecont.c', + 'vp8/common/quant_common.c', + 'vp8/common/reconinter.c', + 'vp8/common/reconintra.c', + 'vp8/common/reconintra4x4.c', + 'vp8/common/setupintrarecon.c', + 'vp8/common/swapyv12buffer.c', + 'vp8/common/treecoder.c', + 'vp8/common/variance_c.c', + 'vp8/decoder/dboolhuff.c', + 'vp8/decoder/decodeframe.c', + 'vp8/decoder/decodemv.c', + 'vp8/decoder/detokenize.c', + 'vp8/decoder/onyxd_if.c', + 'vp8/decoder/threading.c', + 'vp8/encoder/dct.c', + 'vp8/encoder/denoising.c', + 'vp8/encoder/encodeframe.c', + 'vp8/encoder/encodeintra.c', + 'vp8/encoder/encodemb.c', + 'vp8/encoder/encodemv.c', + 'vp8/encoder/ethreading.c', + 'vp8/encoder/firstpass.c', + 'vp8/encoder/lookahead.c', + 'vp8/encoder/mcomp.c', + 'vp8/encoder/modecosts.c', + 'vp8/encoder/mr_dissim.c', + 'vp8/encoder/pickinter.c', + 'vp8/encoder/picklpf.c', + 'vp8/encoder/quantize.c', + 'vp8/encoder/ratectrl.c', + 'vp8/encoder/rdopt.c', + 'vp8/encoder/segmentation.c', + 'vp8/encoder/temporal_filter.c', + 'vp8/encoder/tokenize.c', + 'vp8/encoder/treewriter.c', + 'vp8/vp8_cx_iface.c', + 'vp9/decoder/vp9_read_bit_buffer.c', + 'vpx/src/vpx_codec.c', + 'vpx/src/vpx_decoder.c', + 'vpx/src/vpx_image.c', + 'vpx/src/vpx_psnr.c', + 'vpx_scale/generic/gen_scalers.c', + 'vpx_scale/generic/vpx_scale.c'], 'VP8_POSTPROC': ['vp8/common/mfqe.c', 'vp8/common/postproc.c'], 'VP9_POSTPROC': ['vp9/common/vp9_postproc.c'], 'X86-64_ASM': ['third_party/x86inc/x86inc.asm', - 'vp8/common/x86/loopfilter_block_sse2.asm', - 'vp9/encoder/x86/vp9_quantize_ssse3.asm'], - 'X86_ASM': [ - 'vp8/common/x86/dequantize_mmx.asm', - 'vp8/common/x86/filter_x86.c', - 'vp8/common/x86/idct_blk_mmx.c', - 'vp8/common/x86/idct_blk_sse2.c', - 'vp8/common/x86/idctllm_mmx.asm', - 'vp8/common/x86/idctllm_sse2.asm', - 'vp8/common/x86/iwalsh_mmx.asm', - 'vp8/common/x86/iwalsh_sse2.asm', - 'vp8/common/x86/loopfilter_mmx.asm', - 'vp8/common/x86/loopfilter_sse2.asm', - 'vp8/common/x86/loopfilter_x86.c', - 'vp8/common/x86/mfqe_sse2.asm', - 'vp8/common/x86/postproc_mmx.asm', - 'vp8/common/x86/postproc_sse2.asm', - 'vp8/common/x86/postproc_x86.c', - 'vp8/common/x86/recon_mmx.asm', - 'vp8/common/x86/recon_sse2.asm', - 'vp8/common/x86/recon_wrapper_sse2.c', - 'vp8/common/x86/sad_mmx.asm', - 'vp8/common/x86/sad_sse2.asm', - 'vp8/common/x86/sad_sse3.asm', - 'vp8/common/x86/sad_sse4.asm', - 'vp8/common/x86/sad_ssse3.asm', - 'vp8/common/x86/subpixel_mmx.asm', - 'vp8/common/x86/subpixel_sse2.asm', - 'vp8/common/x86/subpixel_ssse3.asm', - 'vp8/common/x86/variance_impl_mmx.asm', - 'vp8/common/x86/variance_impl_sse2.asm', - 'vp8/common/x86/variance_impl_ssse3.asm', - 'vp8/common/x86/variance_mmx.c', - 'vp8/common/x86/variance_sse2.c', - 
'vp8/common/x86/variance_ssse3.c', - 'vp8/common/x86/vp8_asm_stubs.c', - 'vp8/encoder/x86/dct_mmx.asm', - 'vp8/encoder/x86/dct_sse2.asm', - 'vp8/encoder/x86/denoising_sse2.c', - 'vp8/encoder/x86/encodeopt.asm', - 'vp8/encoder/x86/fwalsh_sse2.asm', - 'vp8/encoder/x86/quantize_mmx.asm', - 'vp8/encoder/x86/quantize_sse2.c', - 'vp8/encoder/x86/quantize_sse4.asm', - 'vp8/encoder/x86/quantize_ssse3.asm', - 'vp8/encoder/x86/subtract_mmx.asm', - 'vp8/encoder/x86/subtract_sse2.asm', - 'vp8/encoder/x86/temporal_filter_apply_sse2.asm', - 'vp8/encoder/x86/vp8_enc_stubs_mmx.c', - 'vp8/encoder/x86/vp8_enc_stubs_sse2.c', - 'vp9/common/x86/vp9_asm_stubs.c', - 'vp9/common/x86/vp9_copy_sse2.asm', - 'vp9/common/x86/vp9_idct_intrin_sse2.c', - 'vp9/common/x86/vp9_intrapred_sse2.asm', - 'vp9/common/x86/vp9_intrapred_ssse3.asm', - 'vp9/common/x86/vp9_loopfilter_intrin_sse2.c', - 'vp9/common/x86/vp9_loopfilter_mmx.asm', - 'vp9/common/x86/vp9_subpixel_8t_sse2.asm', - 'vp9/common/x86/vp9_subpixel_8t_ssse3.asm', - 'vp9/encoder/x86/vp9_dct32x32_sse2.c', - 'vp9/encoder/x86/vp9_dct_sse2.c', - 'vp9/encoder/x86/vp9_error_sse2.asm', - 'vp9/encoder/x86/vp9_sad4d_sse2.asm', - 'vp9/encoder/x86/vp9_sad_mmx.asm', - 'vp9/encoder/x86/vp9_sad_sse2.asm', - 'vp9/encoder/x86/vp9_sad_sse3.asm', - 'vp9/encoder/x86/vp9_sad_sse4.asm', - 'vp9/encoder/x86/vp9_sad_ssse3.asm', - 'vp9/encoder/x86/vp9_subpel_variance.asm', - 'vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm', - 'vp9/encoder/x86/vp9_subtract_sse2.asm', - 'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm', - 'vp9/encoder/x86/vp9_variance_impl_mmx.asm', - 'vp9/encoder/x86/vp9_variance_impl_sse2.asm', - 'vp9/encoder/x86/vp9_variance_mmx.c', - 'vp9/encoder/x86/vp9_variance_sse2.c', - 'vpx_ports/emms.asm', - 'vpx_ports/x86_cpuid.c', - ] + 'vp8/common/x86/loopfilter_block_sse2_x86_64.asm', + 'vp8/encoder/x86/ssim_opt_x86_64.asm', + 'vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm', + 'vp9/encoder/x86/vp9_ssim_opt_x86_64.asm'], + 'X86_ASM': ['vp8/common/x86/dequantize_mmx.asm', + 'vp8/common/x86/filter_x86.c', + 'vp8/common/x86/idct_blk_mmx.c', + 'vp8/common/x86/idct_blk_sse2.c', + 'vp8/common/x86/idctllm_mmx.asm', + 'vp8/common/x86/idctllm_sse2.asm', + 'vp8/common/x86/iwalsh_mmx.asm', + 'vp8/common/x86/iwalsh_sse2.asm', + 'vp8/common/x86/loopfilter_mmx.asm', + 'vp8/common/x86/loopfilter_sse2.asm', + 'vp8/common/x86/loopfilter_x86.c', + 'vp8/common/x86/mfqe_sse2.asm', + 'vp8/common/x86/postproc_mmx.asm', + 'vp8/common/x86/postproc_sse2.asm', + 'vp8/common/x86/recon_mmx.asm', + 'vp8/common/x86/recon_sse2.asm', + 'vp8/common/x86/recon_wrapper_sse2.c', + 'vp8/common/x86/sad_mmx.asm', + 'vp8/common/x86/sad_sse2.asm', + 'vp8/common/x86/sad_sse3.asm', + 'vp8/common/x86/sad_sse4.asm', + 'vp8/common/x86/sad_ssse3.asm', + 'vp8/common/x86/subpixel_mmx.asm', + 'vp8/common/x86/subpixel_sse2.asm', + 'vp8/common/x86/subpixel_ssse3.asm', + 'vp8/common/x86/variance_impl_mmx.asm', + 'vp8/common/x86/variance_impl_sse2.asm', + 'vp8/common/x86/variance_impl_ssse3.asm', + 'vp8/common/x86/variance_mmx.c', + 'vp8/common/x86/variance_sse2.c', + 'vp8/common/x86/variance_ssse3.c', + 'vp8/common/x86/vp8_asm_stubs.c', + 'vp8/encoder/x86/dct_mmx.asm', + 'vp8/encoder/x86/dct_sse2.asm', + 'vp8/encoder/x86/denoising_sse2.c', + 'vp8/encoder/x86/encodeopt.asm', + 'vp8/encoder/x86/fwalsh_sse2.asm', + 'vp8/encoder/x86/quantize_mmx.asm', + 'vp8/encoder/x86/quantize_sse2.c', + 'vp8/encoder/x86/quantize_sse4.c', + 'vp8/encoder/x86/quantize_ssse3.c', + 'vp8/encoder/x86/subtract_mmx.asm', + 
'vp8/encoder/x86/subtract_sse2.asm', + 'vp8/encoder/x86/temporal_filter_apply_sse2.asm', + 'vp8/encoder/x86/vp8_enc_stubs_mmx.c', + 'vp8/encoder/x86/vp8_enc_stubs_sse2.c', + 'vp9/common/x86/vp9_asm_stubs.c', + 'vp9/common/x86/vp9_copy_sse2.asm', + 'vp9/common/x86/vp9_idct_intrin_sse2.c', + 'vp9/common/x86/vp9_idct_intrin_ssse3.c', + 'vp9/common/x86/vp9_idct_ssse3_x86_64.asm', + 'vp9/common/x86/vp9_intrapred_sse2.asm', + 'vp9/common/x86/vp9_intrapred_ssse3.asm', + 'vp9/common/x86/vp9_loopfilter_intrin_sse2.c', + 'vp9/common/x86/vp9_loopfilter_mmx.asm', + 'vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c', + 'vp9/common/x86/vp9_subpixel_8t_sse2.asm', + 'vp9/common/x86/vp9_subpixel_8t_ssse3.asm', + 'vp9/common/x86/vp9_subpixel_bilinear_sse2.asm', + 'vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm', + 'vp9/encoder/x86/vp9_dct32x32_sse2.c', + 'vp9/encoder/x86/vp9_dct_mmx.asm', + 'vp9/encoder/x86/vp9_dct_sse2.c', + 'vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm', + 'vp9/encoder/x86/vp9_error_sse2.asm', + 'vp9/encoder/x86/vp9_sad4d_sse2.asm', + 'vp9/encoder/x86/vp9_sad_sse2.asm', + 'vp9/encoder/x86/vp9_sad_sse3.asm', + 'vp9/encoder/x86/vp9_sad_sse4.asm', + 'vp9/encoder/x86/vp9_sad_ssse3.asm', + 'vp9/encoder/x86/vp9_subpel_variance.asm', + 'vp9/encoder/x86/vp9_subtract_sse2.asm', + 'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm', + 'vp9/encoder/x86/vp9_variance_sse2.c', + 'vpx_ports/emms.asm'] } diff --git a/media/libvpx/third_party/x86inc/x86inc.asm b/media/libvpx/third_party/x86inc/x86inc.asm index 9e4918a0ea89..99453a99854a 100644 --- a/media/libvpx/third_party/x86inc/x86inc.asm +++ b/media/libvpx/third_party/x86inc/x86inc.asm @@ -234,10 +234,10 @@ ALIGNMODE k7 %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rsp + stack_offset + %6] - %define r%1mp qword r %+ %1m + %define r%1mp qword r %+ %1 %+ m %else %define r%1m [esp + stack_offset + %6] - %define r%1mp dword r %+ %1m + %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro @@ -395,6 +395,23 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %assign n_arg_names %0 %endmacro +%if ARCH_X86_64 +%macro ALLOC_STACK 2 ; stack_size, num_regs + %assign %%stack_aligment ((mmsize + 15) & ~15) + %assign stack_size_padded %1 + + %assign %%reg_num (%2 - 1) + %xdefine rsp_tmp r %+ %%reg_num + mov rsp_tmp, rsp + sub rsp, stack_size_padded + and rsp, ~(%%stack_aligment - 1) +%endmacro + +%macro RESTORE_STACK 0 ; reset rsp register + mov rsp, rsp_tmp +%endmacro +%endif + %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx, ecx, cx, cl @@ -592,16 +609,20 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 CAT_XDEFINE cglobaled_, %1, 1 %endif %xdefine current_function %1 - %ifidn __OUTPUT_FORMAT__,elf - global %1:function hidden - %elifidn __OUTPUT_FORMAT__,elf32 - global %1:function hidden - %elifidn __OUTPUT_FORMAT__,elf64 - global %1:function hidden - %elifidn __OUTPUT_FORMAT__,macho32 - global %1:private_extern - %elifidn __OUTPUT_FORMAT__,macho64 - global %1:private_extern + %ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %elifidn __OUTPUT_FORMAT__,elf32 + global %1:function hidden + %elifidn __OUTPUT_FORMAT__,elf64 + global %1:function hidden + %elifidn __OUTPUT_FORMAT__,macho32 + global %1:private_extern + %elifidn __OUTPUT_FORMAT__,macho64 + global %1:private_extern + %else + global %1 + %endif %else global %1 %endif diff --git a/media/libvpx/vp8/common/alloccommon.h b/media/libvpx/vp8/common/alloccommon.h index ea93c252280b..93e99d76b1d7 100644 --- 
a/media/libvpx/vp8/common/alloccommon.h +++ b/media/libvpx/vp8/common/alloccommon.h @@ -9,15 +9,23 @@ */ -#ifndef __INC_ALLOCCOMMON_H -#define __INC_ALLOCCOMMON_H +#ifndef VP8_COMMON_ALLOCCOMMON_H_ +#define VP8_COMMON_ALLOCCOMMON_H_ #include "onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_create_common(VP8_COMMON *oci); void vp8_remove_common(VP8_COMMON *oci); void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); void vp8_setup_version(VP8_COMMON *oci); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ALLOCCOMMON_H_ diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm index dc84c30daf53..39919579f805 100644 --- a/media/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm @@ -53,7 +53,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -77,7 +77,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -101,7 +101,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -127,7 +127,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm index adc353d2006e..915ee4993091 100644 --- a/media/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm @@ -51,7 +51,7 @@ loop orr r8, r8, r10 ; differences of all 4 pixels ; calculate total sum add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum + sub r4, r4, r7 ; subtract negative differences from sum ; calculate sse uxtb16 r7, r8 ; byte (two pixels) to halfwords @@ -77,7 +77,7 @@ loop ; calculate total sum add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum + sub r4, r4, r7 ; subtract negative differences from sum ; calculate sse uxtb16 r7, r8 ; byte (two pixels) to halfwords diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm index dd2ce685c8bc..3668dc517a90 100644 --- a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm @@ -58,7 +58,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from 
sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -89,7 +89,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -120,7 +120,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -153,7 +153,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm index f972d9b5bac4..b4e0959d1b38 100644 --- a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm @@ -69,7 +69,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -111,7 +111,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -153,7 +153,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -195,7 +195,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm index f5da9c09eedd..10863e2ec801 100644 --- a/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm +++ b/media/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm @@ -59,7 +59,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -90,7 +90,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -121,7 +121,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; 
substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -154,7 +154,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/media/libvpx/vp8/common/arm/bilinearfilter_arm.h b/media/libvpx/vp8/common/arm/bilinearfilter_arm.h index b7155d3f0a5c..6b84e6f3b55e 100644 --- a/media/libvpx/vp8/common/arm/bilinearfilter_arm.h +++ b/media/libvpx/vp8/common/arm/bilinearfilter_arm.h @@ -9,8 +9,12 @@ */ -#ifndef BILINEARFILTER_ARM_H -#define BILINEARFILTER_ARM_H +#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ +#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_filter_block2d_bil_first_pass_armv6 ( @@ -32,4 +36,8 @@ extern void vp8_filter_block2d_bil_second_pass_armv6 const short *vp8_filter ); -#endif /* BILINEARFILTER_ARM_H */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ diff --git a/media/libvpx/vp8/common/arm/dequantize_arm.c b/media/libvpx/vp8/common/arm/dequantize_arm.c index 70e72aa47745..1f8157f0b115 100644 --- a/media/libvpx/vp8/common/arm/dequantize_arm.c +++ b/media/libvpx/vp8/common/arm/dequantize_arm.c @@ -12,26 +12,9 @@ #include "vpx_config.h" #include "vp8/common/blockd.h" -#if HAVE_NEON -extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ); -#endif - #if HAVE_MEDIA extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); -#endif -#if HAVE_NEON - -void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) -{ - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; - - vp8_dequantize_b_loop_neon(Q, DQC, DQ); -} -#endif - -#if HAVE_MEDIA void vp8_dequantize_b_v6(BLOCKD *d, short *DQC) { short *DQ = d->dqcoeff; diff --git a/media/libvpx/vp8/common/arm/loopfilter_arm.c b/media/libvpx/vp8/common/arm/loopfilter_arm.c index 3bdc9675e718..5840c2bbaaa9 100644 --- a/media/libvpx/vp8/common/arm/loopfilter_arm.c +++ b/media/libvpx/vp8/common/arm/loopfilter_arm.c @@ -34,11 +34,11 @@ typedef void loopfilter_uv_neon(unsigned char *u, int pitch, extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; -extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; - extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; #endif diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm deleted file mode 100644 index e392786d43d8..000000000000 --- a/media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm +++ /dev/null @@ -1,357 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_bilinear_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, bifilter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! 
- vld1.u8 {d11, d12, d13}, [r0], r1 - - bne filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! 
- - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - vst1.u8 {d4, d5}, [r4], r5 - vst1.u8 {d6, d7}, [r4], r5 - vmov q11, q15 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_sp16x16_loop_neon - - add sp, sp, #272 - - pop {r4-r5,pc} - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r4], r5 ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r4], r5 - vst1.u8 {d18, d19}, [r4], r5 - vst1.u8 {d20, d21}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - pop {r4-r5,pc} - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 
d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r4], r5 - vmov q11, q15 - vst1.u8 {d6, d7}, [r4], r5 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_spo16x16_loop_neon - pop {r4-r5,pc} - - ENDP - -;----------------- - -bifilter16_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm deleted file mode 100644 index 0ac62436f97d..000000000000 --- a/media/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm +++ /dev/null @@ -1,130 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict4x4_neon| PROC - push {r4, lr} - - adr r12, bifilter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x4) - vld1.u8 {d2}, [r0], r1 ;load src data - add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) - - vld1.u8 {d3}, [r0], r1 - vld1.u32 {d31}, [r2] ;first_pass filter - - vld1.u8 {d4}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0-d1) - vld1.u8 {d5}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {d6}, [r0], r1 - - vshr.u64 q4, q1, #8 ;construct src_ptr[1] - vshr.u64 q5, q2, #8 - vshr.u64 d12, d6, #8 - - vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d4, d5 - vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - - vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q8, d10, d1 - vmlal.u8 q9, d12, d1 - - vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d29, q8, #7 - vqrshrn.u16 d30, q9, #7 - -;Second pass: 4x4 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 ;calculate Vfilter location - vld1.u32 {d31}, [r3] ;load second_pass filter - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d31[4] - - vmull.u8 q1, d28, d0 - vmull.u8 q2, d29, d0 - - vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] - vext.8 d27, d29, d30, #4 - - vmlal.u8 q1, d26, d1 - vmlal.u8 q2, d27, d1 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - - vst1.32 {d2[0]}, [r4] ;store result - vst1.32 {d2[1]}, [r0] - vst1.32 {d3[0]}, [r1] - vst1.32 {d3[1]}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - - vld1.32 {d28[0]}, [r0], r1 ;load src data - vld1.32 {d28[1]}, [r0], r1 - vld1.32 {d29[0]}, [r0], r1 - vld1.32 {d29[1]}, [r0], r1 - vld1.32 {d30[0]}, [r0], r1 - - b secondpass_filter - 
-;--------------------- -skip_secondpass_filter - vst1.32 {d28[0]}, [r4], lr ;store result - vst1.32 {d28[1]}, [r4], lr - vst1.32 {d29[0]}, [r4], lr - vst1.32 {d29[1]}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm deleted file mode 100644 index 41f5c45ffe7b..000000000000 --- a/media/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm +++ /dev/null @@ -1,135 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x4_neon| PROC - push {r4, lr} - - adr r12, bifilter8x4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {q5}, [r0], r1 - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d23, q7, #7 - vqrshrn.u16 d24, q8, #7 - vqrshrn.u16 d25, q9, #7 - vqrshrn.u16 d26, q10, #7 - -;Second pass: 4x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1] - vst1.u8 {d5}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 
- - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8x4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm deleted file mode 100644 index c4711bc4d4ad..000000000000 --- a/media/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x8_neon| PROC - push {r4, lr} - - adr r12, bifilter8_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (9x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - - vld1.u8 {q1}, [r0], r1 ;load src data - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vld1.u8 {q2}, [r0], r1 - vqrshrn.u16 d23, q7, #7 - vld1.u8 {q3}, [r0], r1 - vqrshrn.u16 d24, q8, #7 - vld1.u8 {q4}, [r0], r1 - vqrshrn.u16 d25, q9, #7 - - ;first_pass filtering on the rest 5-line data - vld1.u8 {q5}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d27, q7, #7 - vqrshrn.u16 d28, q8, #7 - vqrshrn.u16 d29, q9, #7 - vqrshrn.u16 d30, q10, #7 - -;Second pass: 8x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * 
vp8_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - vmlal.u8 q5, d27, d1 - vmlal.u8 q6, d28, d1 - vmlal.u8 q7, d29, d1 - vmlal.u8 q8, d30, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1], lr - vst1.u8 {d5}, [r1], lr - vst1.u8 {d6}, [r1], lr - vst1.u8 {d7}, [r1], lr - vst1.u8 {d8}, [r1], lr - vst1.u8 {d9}, [r1], lr - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - vld1.u8 {d27}, [r0], r1 - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - vst1.u8 {d26}, [r4], lr - vst1.u8 {d27}, [r4], lr - vst1.u8 {d28}, [r4], lr - vst1.u8 {d29}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 000000000000..9824a3193689 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const uint8_t bifilter4_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +void vp8_bilinear_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8; + uint16x8_t q1u16, q2u16; + uint16x8_t q7u16, q8u16, q9u16; + uint64x2_t q4u64, q5u64; + uint64x1_t d12u64; + uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2; + + if (xoffset == 0) { // skip_1stpass_filter + uint32x2_t d28u32 = vdup_n_u32(0); + uint32x2_t d29u32 = vdup_n_u32(0); + uint32x2_t d30u32 = vdup_n_u32(0); + + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0); + src_ptr += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1); + src_ptr += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + } else { + d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d6u8 = vld1_u8(src_ptr); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); + q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); + d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), + vreinterpret_u32_u8(vget_high_u8(q1u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), + vreinterpret_u32_u8(vget_high_u8(q2u8))); + d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), + vreinterpret_u32_u64(vget_high_u64(q4u64))); + d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), + vreinterpret_u32_u64(vget_high_u64(q5u64))); + + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q9u16 = vmull_u8(d6u8, d0u8); + + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8); + q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8); + + d28u8 = vqrshrn_n_u16(q7u16, 7); + d29u8 = vqrshrn_n_u16(q8u16, 7); + d30u8 = vqrshrn_n_u16(q9u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d28u8, d0u8); + q2u16 = vmull_u8(d29u8, d0u8); + + d26u8 = vext_u8(d28u8, d29u8, 4); + d27u8 = vext_u8(d29u8, d30u8, 4); + + q1u16
= vmlal_u8(q1u16, d26u8, d1u8); + q2u16 = vmlal_u8(q2u16, d27u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + } + return; +} + +void vp8_bilinear_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; + uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t 
*)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + 
vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, 
vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 =vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); 
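/* Editorial note -- each 16-wide row above is loaded as three 8-byte
 * vectors because the horizontal filter reads pixels x and x+1: a
 * 16-pixel output row touches 17 source bytes. The +1 taps are then
 * built with vext_u8 from two adjacent loads instead of a second
 * unaligned load. A self-contained illustration (sketch only, assumes
 * <arm_neon.h>, ARM target): */
#include <arm_neon.h>

static void vext_shift_demo(void) {
    unsigned char row[16] = {0, 1, 2, 3, 4, 5, 6, 7,
                             8, 9, 10, 11, 12, 13, 14, 15};
    uint8x8_t lo = vld1_u8(row);           /* pixels 0..7  */
    uint8x8_t hi = vld1_u8(row + 8);       /* pixels 8..15 */
    uint8x8_t plus1 = vext_u8(lo, hi, 1);  /* pixels 1..8: the x+1 taps */
    unsigned char out[8];

    vst1_u8(out, plus1);                   /* out = {1,2,3,4,5,6,7,8} */
    (void)out;
}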
src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = 
vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); tmpp += 16; + q13u8 = vld1q_u8(tmpp); tmpp += 16; + q14u8 = vld1q_u8(tmpp); tmpp += 16; + q15u8 = vld1q_u8(tmpp); tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/media/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm deleted file mode 100644 index e3ea91fe6c04..000000000000 --- a/media/libvpx/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm +++ /dev/null @@ -1,584 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_build_intra_predictors_mby_neon_func| - EXPORT |vp8_build_intra_predictors_mby_s_neon_func| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_neon_func| PROC - push {r4-r8, lr} - - cmp r3, #0 - beq case_dc_pred - cmp r3, #1 - beq case_v_pred - cmp r3, #2 - beq case_h_pred - cmp r3, #3 - beq case_tm_pred - -case_dc_pred - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up - - cmp r5, #0 - beq skip_dc_pred_left - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - - pop {r4-r8,pc} -case_v_pred - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - pop {r4-r8,pc} - -case_h_pred - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! 
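/* Editorial note -- the case_dc_pred block above averages whichever
 * border pixels exist: shift = 3 + Up + Left, then expected_dc =
 * (sum + (1 << (shift - 1))) >> shift, with 128 as the default when
 * neither border is available. A scalar model, assuming Up and Left
 * are 0/1 flags as the adds suggest (sketch only): */
static unsigned char dc_predict_16x16_ref(const unsigned char *above,
                                          const unsigned char *left,
                                          int up, int left_avail) {
    int sum = 0, shift = 3 + up + left_avail, i;

    if (!up && !left_avail)
        return 128;                 /* no border: default DC value */
    if (up)
        for (i = 0; i < 16; i++) sum += above[i];
    if (left_avail)
        for (i = 0; i < 16; i++) sum += left[i];
    return (unsigned char)((sum + (1 << (shift - 1))) >> shift);
}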
- - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - pop {r4-r8,pc} - -case_tm_pred - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - subs r12, r12, #1 - bne case_tm_pred_loop - - pop {r4-r8,pc} - - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_s_neon_func| PROC - push {r4-r8, lr} - - mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; - - cmp r3, #0 - beq case_dc_pred_s - cmp r3, #1 - beq case_v_pred_s - cmp r3, #2 - beq case_h_pred_s - cmp r3, #3 - beq case_tm_pred_s - -case_dc_pred_s - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left_s - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up_s - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up_s - - cmp r5, #0 - beq skip_dc_pred_left_s - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left_s - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left_s - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 
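/* Editorial note -- the case_tm_pred block above implements VP8
 * TrueMotion prediction: each output pixel is left[r] + above[c] -
 * top_left, saturated to 0..255 (the vqadd.s16/vqshrun.s16 pair
 * performs the clamp). A scalar model (sketch only): */
static unsigned char clamp_255(int v) {
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void tm_predict_16x16_ref(const unsigned char *above,
                                 const unsigned char *left,
                                 unsigned char top_left,
                                 unsigned char *dst, int stride) {
    int r, c;

    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            dst[r * stride + c] = clamp_255(left[r] + above[c] - top_left);
}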
- vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - pop {r4-r8,pc} -case_v_pred_s - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - pop {r4-r8,pc} - -case_h_pred_s - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - pop {r4-r8,pc} - -case_tm_pred_s - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop_s - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - subs r12, r12, #1 - bne case_tm_pred_loop_s - - pop {r4-r8,pc} - - ENDP - - - END diff --git a/media/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm b/media/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm deleted file mode 100644 index bda4b965442f..000000000000 --- a/media/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem16x16_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem16x16_neon| PROC - - vld1.u8 {q0}, [r0], r1 - vld1.u8 {q1}, [r0], r1 - vld1.u8 {q2}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - vld1.u8 {q3}, [r0], r1 - vst1.u8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vst1.u8 {q2}, [r2], r3 - vld1.u8 {q5}, [r0], r1 - vst1.u8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vst1.u8 {q4}, [r2], r3 - vld1.u8 {q7}, [r0], r1 - vst1.u8 {q5}, [r2], r3 - vld1.u8 {q8}, [r0], r1 - vst1.u8 {q6}, [r2], r3 - vld1.u8 {q9}, [r0], r1 - vst1.u8 {q7}, [r2], r3 - vld1.u8 {q10}, [r0], r1 - vst1.u8 {q8}, [r2], r3 - vld1.u8 {q11}, [r0], r1 - vst1.u8 {q9}, [r2], r3 - vld1.u8 {q12}, [r0], r1 - vst1.u8 {q10}, [r2], r3 - vld1.u8 {q13}, [r0], r1 - vst1.u8 {q11}, [r2], r3 - vld1.u8 {q14}, [r0], r1 - vst1.u8 {q12}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - vst1.u8 {q13}, [r2], r3 - vst1.u8 {q14}, [r2], r3 - vst1.u8 {q15}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem16x16_neon| - - END diff --git a/media/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm b/media/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm deleted file mode 100644 index 35c0f6708a55..000000000000 --- a/media/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_copy_mem8x4_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x4_neon| PROC - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vst1.u8 {d3}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem8x4_neon| - - END diff --git a/media/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm b/media/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm deleted file mode 100644 index 1f5b9411bb5e..000000000000 --- a/media/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm +++ /dev/null @@ -1,43 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem8x8_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x8_neon| PROC - - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vld1.u8 {d4}, [r0], r1 - vst1.u8 {d3}, [r2], r3 - vld1.u8 {d5}, [r0], r1 - vst1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r0], r1 - vst1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r0], r1 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem8x8_neon| - - END diff --git a/media/libvpx/vp8/common/arm/neon/copymem_neon.c b/media/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 000000000000..deced115c14c --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +void vp8_copy_mem8x4_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; r++) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm deleted file mode 100644 index 79ff02c69401..000000000000 --- a/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |vp8_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, -; int pred_stride, unsigned char *dst_ptr, -; int dst_stride) - -; r0 input_dc -; r1 pred_ptr -; r2 pred_stride -; r3 dst_ptr -; sp dst_stride - -|vp8_dc_only_idct_add_neon| PROC - add r0, r0, #4 - asr r0, r0, #3 - ldr r12, [sp] - vdup.16 q0, r0 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r3], r12 - vst1.32 {d2[1]}, [r3], r12 - vst1.32 {d4[0]}, [r3], r12 - vst1.32 {d4[1]}, [r3] - - bx lr - - ENDP - - END diff --git a/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 000000000000..ad5f41d7dee1 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +void vp8_dc_only_idct_add_neon( + int16_t input_dc, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm deleted file mode 100644 index 602cce676973..000000000000 --- a/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm +++ /dev/null @@ -1,131 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_dequant_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_add_neon(short *input, short *dq, -; unsigned char *dest, int stride) -; r0 short *input, -; r1 short *dq, -; r2 unsigned char *dest -; r3 int stride - -|vp8_dequant_idct_add_neon| PROC - vld1.16 {q3, q4}, [r0] - vld1.16 {q5, q6}, [r1] - - add r1, r2, r3 ; r1 = dest + stride - lsl r3, #1 ; 2x stride - - vld1.32 {d14[0]}, [r2], r3 - vld1.32 {d14[1]}, [r1], r3 - vld1.32 {d15[0]}, [r2] - vld1.32 {d15[1]}, [r1] - - adr r12, cospi8sqrt2minus1 ; pointer to the first constant - - vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon - vmul.i16 q2, q4, q6 - -;|short_idct4x4llm_neon| PROC - vld1.16 {d0}, [r12] - vswp d3, d4 ;q2(vp[4] vp[12]) - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - -; memset(input, 0, 32) -- 32bytes - vmov.i16 q14, #0 - - vswp d3, d4 - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vmov q15, q14 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vst1.16 {q14, q15}, [r0] - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vaddw.u8 q1, q1, d14 - vaddw.u8 q2, q2, d15 - - sub r2, r2, r3 - sub r1, r1, r3 - - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - - vst1.32 {d0[0]}, [r2], r3 - vst1.32 {d0[1]}, [r1], r3 - vst1.32 {d1[0]}, [r2] - vst1.32 {d1[1]}, [r1] - - bx lr - - ENDP ; |vp8_dequant_idct_add_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b4e7b -sinpi8sqrt2 DCD 0x8a8c8a8c - - END diff --git a/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 000000000000..58e11922c760 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_dequant_idct_add_neon( + int16_t *input, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3), + vreinterpretq_u16_s16(q5))); + q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4), + vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), + vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), + vreinterpret_u8_s32(d15))); + + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = 
vreinterpret_s32_u8(vqmovun_s16(q2));
+
+    dst0 = dst;
+    vst1_lane_s32((int32_t *)dst0, d14, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d14, 1);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 0);
+    dst0 += stride;
+    vst1_lane_s32((int32_t *)dst0, d15, 1);
+    return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
deleted file mode 100644
index c8e0c31f29c9..000000000000
--- a/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm
+++ /dev/null
@@ -1,34 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequantize_b_loop_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-; r0    short *Q,
-; r1    short *DQC
-; r2    short *DQ
-|vp8_dequantize_b_loop_neon| PROC
-    vld1.16     {q0, q1}, [r0]
-    vld1.16     {q2, q3}, [r1]
-
-    vmul.i16    q4, q0, q2
-    vmul.i16    q5, q1, q3
-
-    vst1.16     {q4, q5}, [r2]
-
-    bx          lr
-
-    ENDP
-
-    END
diff --git a/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
new file mode 100644
index 000000000000..54e709dd3c32
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c
@@ -0,0 +1,25 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "vp8/common/blockd.h"
+
+void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
+    int16x8x2_t qQ, qDQC, qDQ;
+
+    qQ   = vld2q_s16(d->qcoeff);
+    qDQC = vld2q_s16(DQC);
+
+    qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
+    qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
+
+    vst2q_s16(d->dqcoeff, qDQ);
+}
diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
deleted file mode 100644
index 6c29c55860d8..000000000000
--- a/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ /dev/null
@@ -1,79 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license and patent
-;  grant that can be found in the LICENSE file in the root of the source
-;  tree. All contributing project authors may be found in the AUTHORS
-;  file in the root of the source tree.
-; - - - EXPORT |idct_dequant_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_0_2x_neon(short *q, short dq, -; unsigned char *dst, int stride); -; r0 *q -; r1 dq -; r2 *dst -; r3 stride -|idct_dequant_0_2x_neon| PROC - push {r4, r5} - - add r12, r2, #4 - vld1.32 {d2[0]}, [r2], r3 - vld1.32 {d8[0]}, [r12], r3 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d10[1]}, [r12], r3 - - ldrh r12, [r0] ; lo q - ldrh r4, [r0, #32] ; hi q - mov r5, #0 - strh r5, [r0] - strh r5, [r0, #32] - - sxth r12, r12 ; lo - mul r0, r12, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q0, r0 - sxth r4, r4 ; hi - mul r0, r4, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r0, r2, #4 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d10[1]}, [r0] - - pop {r4, r5} - bx lr - - ENDP ; |idct_dequant_0_2x_neon| - END diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c new file mode 100644 index 000000000000..967c322804ff --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +void idct_dequant_0_2x_neon( + int16_t *q, + int16_t dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32, d4s32; + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm deleted file mode 100644 index d5dce63f6bd7..000000000000 --- a/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm +++ /dev/null @@ -1,196 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |idct_dequant_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_full_2x_neon(short *q, short *dq, -; unsigned char *dst, int stride); -; r0 *q, -; r1 *dq, -; r2 *dst -; r3 stride -|idct_dequant_full_2x_neon| PROC - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r3 ; l pre - vld1.32 {d28[1]}, [r12], r3 ; r pre - vld1.32 {d29[0]}, [r2], r3 - vld1.32 {d29[1]}, [r12], r3 - vld1.32 {d30[0]}, [r2], r3 - vld1.32 {d30[1]}, [r12], r3 - vld1.32 {d31[0]}, [r2], r3 - vld1.32 {d31[1]}, [r12] - - adr r1, cospi8sqrt2minus1 ; pointer to the first constant - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - vld1.16 {d0}, [r1] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. 
this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! - ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - sub r2, r2, r3, lsl #2 ; dst - 4*stride - add r1, r2, #4 ; hi - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - vst1.32 {d0[0]}, [r2], r3 ; lo - vst1.32 {d0[1]}, [r1], r3 ; hi - vst1.32 {d1[0]}, [r2], r3 - vst1.32 {d1[1]}, [r1], r3 - vst1.32 {d2[0]}, [r2], r3 - vst1.32 {d2[1]}, [r1], r3 - vst1.32 {d3[0]}, [r2] - vst1.32 {d3[1]}, [r1] - - bx lr - - ENDP ; |idct_dequant_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c new file mode 100644 index 000000000000..a60ed46b7646 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +void idct_dequant_full_2x_neon( + int16_t *q, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 = vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = 
vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), + vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), + vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), + vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), + vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm deleted file mode 100644 index e8ea2a61976c..000000000000 --- a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.asm +++ /dev/null @@ -1,87 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - EXPORT |vp8_short_inv_walsh4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) -|vp8_short_inv_walsh4x4_neon| PROC - - ; read in all four lines of values: d0->d3 - vld1.i16 {q0-q1}, [r0@128] - - ; first for loop - vadd.s16 d4, d0, d3 ;a = [0] + [12] - vadd.s16 d6, d1, d2 ;b = [4] + [8] - vsub.s16 d5, d0, d3 ;d = [0] - [12] - vsub.s16 d7, d1, d2 ;c = [4] - [8] - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vtrn.32 d0, d2 ;d0: 0 1 8 9 - ;d2: 2 3 10 11 - vtrn.32 d1, d3 ;d1: 4 5 12 13 - ;d3: 6 7 14 15 - - vtrn.16 d0, d1 ;d0: 0 4 8 12 - ;d1: 1 5 9 13 - vtrn.16 d2, d3 ;d2: 2 6 10 14 - ;d3: 3 7 11 15 - - ; second for loop - - vadd.s16 d4, d0, d3 ;a = [0] + [3] - vadd.s16 d6, d1, d2 ;b = [1] + [2] - vsub.s16 d5, d0, d3 ;d = [0] - [3] - vsub.s16 d7, d1, d2 ;c = [1] - [2] - - vmov.i16 q8, #3 - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vadd.i16 q0, q0, q8 ;e/f += 3 - vadd.i16 q1, q1, q8 ;g/h += 3 - - vshr.s16 q0, q0, #3 ;e/f >> 3 - vshr.s16 q1, q1, #3 ;g/h >> 3 - - mov r2, #64 - add r3, r1, #32 - - vst1.i16 d0[0], [r1],r2 - vst1.i16 d1[0], [r3],r2 - vst1.i16 d2[0], [r1],r2 - vst1.i16 d3[0], [r3],r2 - - vst1.i16 d0[1], [r1],r2 - vst1.i16 d1[1], [r3],r2 - vst1.i16 d2[1], [r1],r2 - vst1.i16 d3[1], [r3],r2 - - vst1.i16 d0[2], [r1],r2 - vst1.i16 d1[2], [r3],r2 - vst1.i16 d2[2], [r1],r2 - vst1.i16 d3[2], [r3],r2 - - vst1.i16 d0[3], [r1],r2 - vst1.i16 d1[3], [r3],r2 - vst1.i16 d2[3], [r1] - vst1.i16 d3[3], [r3] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| - - END diff --git a/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 000000000000..6ea9dd712aaa --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +void vp8_short_inv_walsh4x4_neon( + int16_t *input, + int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm deleted file mode 100644 index e44be0a1e34d..000000000000 --- a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.asm +++ /dev/null @@ -1,397 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_loop_filter_horizontal_edge_y_neon| - EXPORT |vp8_loop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_loop_filter_vertical_edge_y_neon| - EXPORT |vp8_loop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_loop_filter_horizontal_edge_y_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r3, [sp, #4] ; load thresh - add r12, r2, r1 - add r1, r1, r1 - - vdup.u8 q2, r3 ; duplicate thresh - - vld1.u8 {q3}, [r2@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r2@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r2@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r2@128] ; q2 - vld1.u8 {q10}, [r12@128] ; q3 - - sub r2, r2, r1, lsl #1 - sub r12, r12, r1, lsl #1 - - bl vp8_loop_filter_neon - - vst1.u8 {q5}, [r2@128], r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r2@128], r1 ; store oq0 - vst1.u8 {q8}, [r12@128], r1 ; store oq1 - - pop {pc} - ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| - - -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp8_loop_filter_horizontal_edge_uv_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - ldr r12, [sp, #4] ; load thresh - ldr r2, [sp, #8] ; load v ptr - vdup.u8 q2, r12 ; duplicate thresh - - sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r3@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r3@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r3@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r3@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r3@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r3@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r3@64] ; q3 - vld1.u8 {d21}, [r12@64] ; q3 - - bl vp8_loop_filter_neon - - sub r0, r0, r1, lsl #1 - sub r2, r2, r1, lsl #1 - - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r2@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r2@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r2@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64] ; store u oq1 - vst1.u8 {d17}, [r2@64] ; store v oq1 - - pop {pc} - ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| - -; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, - -|vp8_loop_filter_vertical_edge_y_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, #4 ; src ptr down by 4 columns - add r1, r1, r1 - ldr r3, [sp, #4] ; load thresh - add r12, r2, r1, asr #1 - - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r12], r1 - 
vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r12], r1 - - vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d19}, [r2] - vld1.u8 {d21}, [r12] - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r3 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp8_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - - sub r0, r0, #2 ; dst ptr - - vswp d14, d12 - vswp d16, d15 - - add r12, r0, r1, asr #1 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 - - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - - pop {pc} - ENDP ; |vp8_loop_filter_vertical_edge_y_neon| - -; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp8_loop_filter_vertical_edge_uv_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - sub r12, r0, #4 ; move u pointer down by 4 columns - ldr r2, [sp, #8] ; load v ptr - vdup.u8 q1, r3 ; duplicate limit - sub r3, r2, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d9}, [r3], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d11}, [r3], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d13}, [r3], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d15}, [r3], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r12] - vld1.u8 {d21}, [r3] - - ldr r12, [sp, #4] ; load thresh - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r12 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp8_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - vswp d14, d12 - vswp d16, d15 - - sub r0, r0, #2 - sub r2, r2, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 - vst4.8 {d10[3], d11[3], d12[3], 
d13[3]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - - pop {pc} - ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| - -; void vp8_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. - -; r0-r3 PRESERVE -; q0 flimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -|vp8_loop_filter_neon| PROC - - ; vp8_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) - vabd.u8 q4, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q4 - vmax.u8 q15, q11, q12 - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 - - vmov.u8 q10, #0x80 ; 0x80 - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - vcge.u8 q15, q1, q15 - - ; vp8_filter() function - ; convert to signed - veor q7, q7, q10 ; qs0 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u8 q10, #3 ; #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - - vmovl.u8 q4, d20 - - vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; vp8_hevmask - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; vp8_filter &= hev - vand q15, q15, q9 ; vp8_filter_mask - - vaddw.s8 q2, q2, d2 - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 ; #4 - - ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3) - vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) - - ; outer tap adjustments: ++vp8_filter >> 1 - vrshr.s8 q1, q1, #1 - vbic q1, q1, q14 ; vp8_filter &= ~hev - vmov.u8 q0, #0x80 ; 0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - veor q5, q13, q0 ; *op1 = u^0x80 - veor q8, q12, q0 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| - -;----------------- - - END diff --git a/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c new file mode 100644 index 000000000000..0bec7fb06431 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfilter_neon.c @@ -0,0 +1,549 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_neon(
+        uint8x16_t qblimit,  // flimit
+        uint8x16_t qlimit,   // limit
+        uint8x16_t qthresh,  // thresh
+        uint8x16_t q3,       // p3
+        uint8x16_t q4,       // p2
+        uint8x16_t q5,       // p1
+        uint8x16_t q6,       // p0
+        uint8x16_t q7,       // q0
+        uint8x16_t q8,       // q1
+        uint8x16_t q9,       // q2
+        uint8x16_t q10,      // q3
+        uint8x16_t *q5r,     // p1
+        uint8x16_t *q6r,     // p0
+        uint8x16_t *q7r,     // q0
+        uint8x16_t *q8r) {   // q1
+    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+    int16x8_t q2s16, q11s16;
+    uint16x8_t q4u16;
+    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
+    int8x8_t d2s8, d3s8;
+
+    q11u8 = vabdq_u8(q3, q4);
+    q12u8 = vabdq_u8(q4, q5);
+    q13u8 = vabdq_u8(q5, q6);
+    q14u8 = vabdq_u8(q8, q7);
+    q3 = vabdq_u8(q9, q8);
+    q4 = vabdq_u8(q10, q9);
+
+    q11u8 = vmaxq_u8(q11u8, q12u8);
+    q12u8 = vmaxq_u8(q13u8, q14u8);
+    q3 = vmaxq_u8(q3, q4);
+    q15u8 = vmaxq_u8(q11u8, q12u8);
+
+    q9 = vabdq_u8(q6, q7);
+
+    // vp8_hevmask
+    q13u8 = vcgtq_u8(q13u8, qthresh);
+    q14u8 = vcgtq_u8(q14u8, qthresh);
+    q15u8 = vmaxq_u8(q15u8, q3);
+
+    q2u8 = vabdq_u8(q5, q8);
+    q9 = vqaddq_u8(q9, q9);
+
+    q15u8 = vcgeq_u8(qlimit, q15u8);
+
+    // vp8_filter() function
+    // convert to signed
+    q10 = vdupq_n_u8(0x80);
+    q8 = veorq_u8(q8, q10);
+    q7 = veorq_u8(q7, q10);
+    q6 = veorq_u8(q6, q10);
+    q5 = veorq_u8(q5, q10);
+
+    q2u8 = vshrq_n_u8(q2u8, 1);
+    q9 = vqaddq_u8(q9, q2u8);
+
+    q10 = vdupq_n_u8(3);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6)));
+    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+                      vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+    q9 = vcgeq_u8(qblimit, q9);
+
+    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+                     vreinterpretq_s8_u8(q8));
+
+    q14u8 = vorrq_u8(q13u8, q14u8);
+
+    q4u16 = vmovl_u8(vget_low_u8(q10));
+    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+    q15u8 = vandq_u8(q15u8, q9);
+
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+    q9 = vdupq_n_u8(4);
+    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+    d2s8 = vqmovn_s16(q2s16);
+    d3s8 = vqmovn_s16(q11s16);
+    q1s8 = vcombine_s8(d2s8, d3s8);
+    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+    q1s8 = vreinterpretq_s8_u8(q1u8);
+
+    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
+    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q1s8 = vshrq_n_s8(q1s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+    q1s8 = vrshrq_n_s8(q1s8, 1);
+    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+    q0u8 = vdupq_n_u8(0x80);
+    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
+    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
+    return;
+}
+
+void vp8_loop_filter_horizontal_edge_y_neon(
+        unsigned char
*src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +static INLINE void write_4x8(unsigned char *dst, int pitch, + const uint8x8x4_t result) { +#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) + vst4_lane_u8(dst, result, 0); + dst += pitch; + vst4_lane_u8(dst, result, 1); + dst += pitch; + vst4_lane_u8(dst, result, 2); + dst += pitch; + vst4_lane_u8(dst, result, 3); + dst += pitch; + vst4_lane_u8(dst, result, 4); + dst += pitch; + vst4_lane_u8(dst, result, 5); + dst += pitch; + vst4_lane_u8(dst, result, 6); + dst += pitch; + vst4_lane_u8(dst, result, 7); +#else + /* + * uint8x8x4_t result + 00 01 02 03 | 04 05 06 07 + 10 11 12 13 | 14 15 16 17 + 20 21 22 23 | 24 25 26 27 + 30 31 32 33 | 34 35 36 37 + --- + * after vtrn_u16 + 00 01 20 21 | 04 05 24 25 + 02 03 22 23 | 06 07 26 27 + 10 11 30 31 | 14 15 34 35 + 12 13 32 33 | 16 17 36 37 + --- + * after vtrn_u8 + 00 10 20 30 | 04 14 24 34 + 01 11 21 31 | 05 15 25 35 + 02 12 22 32 | 06 16 26 36 + 03 13 23 33 | 07 17 27 37 + */ + 
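+    /* The vtrn_u16/vtrn_u8 pairs below finish the transpose sketched in
+     * the diagram above: each output row's four bytes end up contiguous
+     * in a single 32-bit lane, so every row can be written with one
+     * vst1_lane_u32 instead of the vst4_lane_u8 path, which the guard
+     * above only enables for GCC >= 4.7.
+     */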
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]), + vreinterpret_u16_u8(result.val[2])); + const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]), + vreinterpret_u16_u8(result.val[3])); + const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), + vreinterpret_u8_u16(r13_u16.val[0])); + const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), + vreinterpret_u8_u16(r13_u16.val[1])); + const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]); + const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]); + const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]); + const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]); + vst1_lane_u32((uint32_t *)dst, x_0_4, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 0); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_0_4, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_1_5, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_2_6, 1); + dst += pitch; + vst1_lane_u32((uint32_t *)dst, x_3_7, 1); +#endif +} + +void vp8_loop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = 
vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + write_4x8(d, pitch, q4ResultL); + d += pitch * 8; + write_4x8(d, pitch, q4ResultH); +} + +void vp8_loop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = 
q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + write_4x8(ud, pitch, q4ResultL); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + write_4x8(vd, pitch, q4ResultH); +} diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm deleted file mode 100644 index adf848b9c347..000000000000 --- a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ /dev/null @@ -1,117 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon| - EXPORT |vp8_loop_filter_bhs_neon| - EXPORT |vp8_loop_filter_mbhs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp8_loop_filter_simple_horizontal_edge_neon| PROC - - sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines - - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q5}, [r3@128], r1 ; p0 - vld1.u8 {q8}, [r0@128] ; q1 - vld1.u8 {q6}, [r3@128] ; p1 - - vabd.u8 q15, q6, q7 ; abs(p0 - q0) - vabd.u8 q14, q5, q8 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q13, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value - veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value - veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value - veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q3, d15, d13 - - vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) - vmul.s16 q3, q3, q13 - - vmov.u8 q10, #0x03 ; 0x03 - vmov.u8 q9, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q3, q3, d9 - - vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d9, q3 - - vand q14, q4, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - - sub r0, r0, r1 - - ;calculate output - vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 
= u^0x80 - - vst1.u8 {q6}, [r3@128] ; store op0 - vst1.u8 {q7}, [r0@128] ; store oq0 - - bx lr - ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp8_loop_filter_bhs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate blim - - add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride - bl vp8_loop_filter_simple_horizontal_edge_neon - ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 - add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride - bl vp8_loop_filter_simple_horizontal_edge_neon - add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride - pop {r4, lr} - b vp8_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp8_loop_filter_bhs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp8_loop_filter_mbhs_neon| PROC - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp8_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp8_loop_filter_bhs_neon| - - END diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c new file mode 100644 index 000000000000..b25686ffb88b --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    uint8_t *sp;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
+    int16x8_t q2s16, q3s16, q13s16;
+    int8x8_t d8s8, d9s8;
+    int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    sp = s - (p << 1);
+    q5u8 = vld1q_u8(sp);
+    sp += p;
+    q6u8 = vld1q_u8(sp);
+    sp += p;
+    q7u8 = vld1q_u8(sp);
+    sp += p;
+    q8u8 = vld1q_u8(sp);
+
+    q15u8 = vabdq_u8(q6u8, q7u8);
+    q14u8 = vabdq_u8(q5u8, q8u8);
+
+    q15u8 = vqaddq_u8(q15u8, q15u8);
+    q14u8 = vshrq_n_u8(q14u8, 1);
+    q0u8 = vdupq_n_u8(0x80);
+    q13s16 = vdupq_n_s16(3);
+    q15u8 = vqaddq_u8(q15u8, q14u8);
+
+    q5u8 = veorq_u8(q5u8, q0u8);
+    q6u8 = veorq_u8(q6u8, q0u8);
+    q7u8 = veorq_u8(q7u8, q0u8);
+    q8u8 = veorq_u8(q8u8, q0u8);
+
+    q15u8 = vcgeq_u8(qblimit, q15u8);
+
+    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_low_s8(vreinterpretq_s8_u8(q6u8)));
+    q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
+                     vget_high_s8(vreinterpretq_s8_u8(q6u8)));
+
+    q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
+                     vreinterpretq_s8_u8(q8u8));
+
+    q2s16 = vmulq_s16(q2s16, q13s16);
+    q3s16 = vmulq_s16(q3s16, q13s16);
+
+    q10u8 = vdupq_n_u8(3);
+    q9u8 = vdupq_n_u8(4);
+
+    q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
+    q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
+
+    d8s8 = vqmovn_s16(q2s16);
+    d9s8 = vqmovn_s16(q3s16);
+    q4s8 = vcombine_s8(d8s8, d9s8);
+
+    q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
+
+    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
+    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
+    q2s8 = vshrq_n_s8(q2s8, 3);
+    q3s8 = vshrq_n_s8(q3s8, 3);
+
+    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
+    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
+
+    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
+    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
+
+    vst1q_u8(s, q7u8);
+    s -= p;
+    vst1q_u8(s, q6u8);
+    return;
+}
+
+void vp8_loop_filter_bhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    y_ptr += y_stride * 4;
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
+
+void vp8_loop_filter_mbhs_neon(
+        unsigned char *y_ptr,
+        int y_stride,
+        const unsigned char *blimit) {
+    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
+    return;
+}
diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
deleted file mode 100644
index e690df2f7de9..000000000000
--- a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ /dev/null
@@ -1,154 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-; - - - ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon| - EXPORT |vp8_loop_filter_bvs_neon| - EXPORT |vp8_loop_filter_mbvs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp8_loop_filter_simple_vertical_edge_neon| PROC - sub r0, r0, #2 ; move src pointer down by 2 columns - add r12, r1, r1 - add r3, r0, r1 - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] - - vswp d7, d10 - vswp d12, d9 - - ;vp8_filter_mask() function - ;vp8_hevmask() function - sub r0, r0, r1, lsl #4 - vabd.u8 q15, q5, q4 ; abs(p0 - q0) - vabd.u8 q14, q3, q6 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q11, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value - veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value - veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value - veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - - vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) - vsubl.s8 q13, d9, d11 - - vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) - vmul.s16 q13, q13, q11 - - vmov.u8 q11, #0x03 ; 0x03 - vmov.u8 q12, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d29 - - vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d29, q13 - - add r0, r0, #1 - add r3, r0, r1 - - vand q14, q14, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) - vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q14, q3, #3 ; Filter1 >>= 3 - - ;calculate output - vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - add r12, r1, r1 - vswp d13, d14 - - ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0], r12 - vst2.8 {d12[1], d13[1]}, [r3], r12 - vst2.8 {d12[2], d13[2]}, [r0], r12 - vst2.8 {d12[3], d13[3]}, [r3], r12 - vst2.8 {d12[4], d13[4]}, [r0], r12 - vst2.8 {d12[5], d13[5]}, [r3], r12 - vst2.8 {d12[6], d13[6]}, [r0], r12 - vst2.8 {d12[7], d13[7]}, [r3], r12 - vst2.8 {d14[0], d15[0]}, [r0], r12 - vst2.8 {d14[1], d15[1]}, [r3], r12 - vst2.8 {d14[2], d15[2]}, [r0], r12 - vst2.8 {d14[3], d15[3]}, [r3], r12 - vst2.8 {d14[4], d15[4]}, [r0], r12 - vst2.8 {d14[5], d15[5]}, [r3], r12 - vst2.8 {d14[6], d15[6]}, [r0], 
r12
-    vst2.8      {d14[7], d15[7]}, [r3]
-
-    bx          lr
-    ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_bvs_neon| PROC
-    push        {r4, lr}
-    ldrb        r3, [r2]                    ; load blim from mem
-    mov         r4, r0
-    add         r0, r0, #4
-    vdup.s8     q1, r3                      ; duplicate blim
-    bl          vp8_loop_filter_simple_vertical_edge_neon
-    ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
-    add         r0, r4, #8
-    bl          vp8_loop_filter_simple_vertical_edge_neon
-    add         r0, r4, #12
-    pop         {r4, lr}
-    b           vp8_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp8_loop_filter_bvs_neon|
-
-; r0    unsigned char *y
-; r1    int ystride
-; r2    const unsigned char *blimit
-
-|vp8_loop_filter_mbvs_neon| PROC
-    ldrb        r3, [r2]                    ; load mblim from mem
-    vdup.s8     q1, r3                      ; duplicate mblim
-    b           vp8_loop_filter_simple_vertical_edge_neon
-    ENDP        ;|vp8_loop_filter_bvs_neon|
-    END
diff --git a/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
new file mode 100644
index 000000000000..d5178bbae146
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c
@@ -0,0 +1,279 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vpx_config.h"
+
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE void write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+    vst2_lane_u8(dst, result, 0);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 1);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 2);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 3);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 4);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 5);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 6);
+    dst += pitch;
+    vst2_lane_u8(dst, result, 7);
+    dst += pitch;
+
+    vst2_lane_u8(dst, result2, 0);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 1);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 2);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 3);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 4);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 5);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 6);
+    dst += pitch;
+    vst2_lane_u8(dst, result2, 7);
+}
+#else
+static INLINE void write_2x4(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result) {
+    /*
+     * uint8x8x2_t result
+       00 01 02 03 | 04 05 06 07
+       10 11 12 13 | 14 15 16 17
+       ---
+     * after vtrn_u8
+       00 10 02 12 | 04 14 06 16
+       01 11 03 13 | 05 15 07 17
+     */
+    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
+                                       result.val[1]);
+    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
+    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
+    dst += pitch;
+    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
+}
+
+static INLINE void
write_2x8(unsigned char *dst, int pitch,
+                             const uint8x8x2_t result,
+                             const uint8x8x2_t result2) {
+    write_2x4(dst, pitch, result);
+    dst += pitch * 8;
+    write_2x4(dst, pitch, result2);
+}
+#endif
+
+
+#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    x = vld4_lane_u8(src, x, 0);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 1);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 2);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 3);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 4);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 5);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 6);
+    src += pitch;
+    x = vld4_lane_u8(src, x, 7);
+    return x;
+}
+#else
+static INLINE
+uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
+    const uint8x8_t a = vld1_u8(src);
+    const uint8x8_t b = vld1_u8(src + pitch * 1);
+    const uint8x8_t c = vld1_u8(src + pitch * 2);
+    const uint8x8_t d = vld1_u8(src + pitch * 3);
+    const uint8x8_t e = vld1_u8(src + pitch * 4);
+    const uint8x8_t f = vld1_u8(src + pitch * 5);
+    const uint8x8_t g = vld1_u8(src + pitch * 6);
+    const uint8x8_t h = vld1_u8(src + pitch * 7);
+    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
+                                          vreinterpret_u32_u8(e));
+    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
+                                          vreinterpret_u32_u8(f));
+    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
+                                          vreinterpret_u32_u8(g));
+    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
+                                          vreinterpret_u32_u8(h));
+    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
+                                          vreinterpret_u16_u32(r26_u32.val[0]));
+    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
+                                          vreinterpret_u16_u32(r37_u32.val[0]));
+    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
+                                       vreinterpret_u8_u16(r13_u16.val[0]));
+    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
+                                       vreinterpret_u8_u16(r13_u16.val[1]));
+    /*
+     * after vtrn_u32
+       00 01 02 03 | 40 41 42 43
+       10 11 12 13 | 50 51 52 53
+       20 21 22 23 | 60 61 62 63
+       30 31 32 33 | 70 71 72 73
+       ---
+     * after vtrn_u16
+       00 01 20 21 | 40 41 60 61
+       02 03 22 23 | 42 43 62 63
+       10 11 30 31 | 50 51 70 71
+       12 13 32 33 | 52 53 72 73
+
+       00 01 20 21 | 40 41 60 61
+       10 11 30 31 | 50 51 70 71
+       02 03 22 23 | 42 43 62 63
+       12 13 32 33 | 52 53 72 73
+       ---
+     * after vtrn_u8
+       00 10 20 30 | 40 50 60 70
+       01 11 21 31 | 41 51 61 71
+       02 12 22 32 | 42 52 62 72
+       03 13 23 33 | 43 53 63 73
+     */
+    x.val[0] = r01_u8.val[0];
+    x.val[1] = r01_u8.val[1];
+    x.val[2] = r23_u8.val[0];
+    x.val[3] = r23_u8.val[1];
+
+    return x;
+}
+#endif
+
+static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
+        unsigned char *s,
+        int p,
+        const unsigned char *blimit) {
+    unsigned char *src1;
+    uint8x16_t qblimit, q0u8;
+    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
+    int16x8_t q2s16, q13s16, q11s16;
+    int8x8_t d28s8, d29s8;
+    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
+    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
+    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
+    uint8x8x2_t d2u8x2;  // d12, d13
+    uint8x8x2_t d3u8x2;  // d14, d15
+
+    qblimit = vdupq_n_u8(*blimit);
+
+    src1 = s - 2;
+    d0u8x4 = read_4x8(src1, p, d0u8x4);
+    src1 += p * 8;
+    d1u8x4 = read_4x8(src1, p, d1u8x4);
+
+    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
+    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
+    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
+    q6u8 = vcombine_u8(d0u8x4.val[3],
d1u8x4.val[3]); // d9 d13 + + q15u8 = vabdq_u8(q5u8, q4u8); + q14u8 = vabdq_u8(q3u8, q6u8); + + q15u8 = vqaddq_u8(q15u8, q15u8); + q14u8 = vshrq_n_u8(q14u8, 1); + q0u8 = vdupq_n_u8(0x80); + q11s16 = vdupq_n_s16(3); + q15u8 = vqaddq_u8(q15u8, q14u8); + + q3u8 = veorq_u8(q3u8, q0u8); + q4u8 = veorq_u8(q4u8, q0u8); + q5u8 = veorq_u8(q5u8, q0u8); + q6u8 = veorq_u8(q6u8, q0u8); + + q15u8 = vcgeq_u8(qblimit, q15u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), + vget_low_s8(vreinterpretq_s8_u8(q5u8))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), + vget_high_s8(vreinterpretq_s8_u8(q5u8))); + + q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), + vreinterpretq_s8_u8(q6u8)); + + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q11u8 = vdupq_n_u8(3); + q12u8 = vdupq_n_u8(4); + + q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); + + d28s8 = vqmovn_s16(q2s16); + d29s8 = vqmovn_s16(q13s16); + q14s8 = vcombine_s8(d28s8, d29s8); + + q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); + + q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); + q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q14s8 = vshrq_n_s8(q3s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); + + q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + + d2u8x2.val[0] = vget_low_u8(q6u8); // d12 + d2u8x2.val[1] = vget_low_u8(q7u8); // d14 + d3u8x2.val[0] = vget_high_u8(q6u8); // d13 + d3u8x2.val[1] = vget_high_u8(q7u8); // d15 + + src1 = s - 1; + write_2x8(src1, p, d2u8x2, d3u8x2); +} + +void vp8_loop_filter_bvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + y_ptr += 4; + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} + +void vp8_loop_filter_mbvs_neon( + unsigned char *y_ptr, + int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm deleted file mode 100644 index f41c156df8b2..000000000000 --- a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.asm +++ /dev/null @@ -1,469 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| - EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_horizontal_edge_y_neon| PROC - push {lr} - add r1, r1, r1 ; double stride - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - add r12, r0, r1, lsr #1 ; move src pointer up by 1 line - - vld1.u8 {q3}, [r0@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r0@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r0@128], r1 ; q2 - vld1.u8 {q10}, [r12@128], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #2 - add r0, r12, r1, lsr #1 - - vst1.u8 {q4}, [r12@128],r1 ; store op2 - vst1.u8 {q5}, [r0@128],r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r0@128],r1 ; store oq0 - vst1.u8 {q8}, [r12@128] ; store oq1 - vst1.u8 {q9}, [r0@128] ; store oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| - -; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v - -|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r0@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r0@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r0@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r0@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r0@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r0@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r0@64], r1 ; q3 - vld1.u8 {d21}, [r12@64], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r0, r0, r1, lsl #3 - sub r12, r12, r1, lsl #3 - - add r0, r0, r1 - add r12, r12, r1 - - vst1.u8 {d8}, [r0@64], r1 ; store u op2 - vst1.u8 {d9}, [r12@64], r1 ; store v op2 - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r12@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r12@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r12@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64], r1 ; store u oq1 - vst1.u8 {d17}, [r12@64], r1 ; store v oq1 - vst1.u8 {d18}, [r0@64], r1 ; store u oq2 - vst1.u8 {d19}, [r12@64], r1 ; store v oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| - -; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char 
*thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_vertical_edge_y_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move src pointer down by 4 columns - vdup.s8 q2, r12 ; thresh - add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - vld1.u8 {d7}, [r12], r1 ; load second 8-line src data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| - -; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 unsigned char *v -|vp8_mbloop_filter_vertical_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move u pointer down by 4 columns - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r12], r1 ;load v data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, 
op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| - -; void vp8_mbloop_filter_neon() -; This is a helper function for the macroblock loopfilters. The individual -; functions do the necessary load, transpose (if necessary), preserve (if -; necessary) and store. - -; r0,r1 PRESERVE -; r2 mblimit -; r3 limit - -; q2 thresh -; q3 p3 PRESERVE -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 PRESERVE - -|vp8_mbloop_filter_neon| PROC - - ; vp8_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q1, q9, q8 ; abs(q2 - q1) - vabd.u8 q0, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q1, q1, q0 - vmax.u8 q15, q11, q12 - - vabd.u8 q12, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q1 - - vdup.u8 q1, r3 ; limit - vdup.u8 q2, r2 ; mblimit - - vmov.u8 q0, #0x80 ; 0x80 - - vcge.u8 q15, q1, q15 - - vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vmov.u16 q11, #3 ; #3 - - ; vp8_filter - ; convert to signed - veor q7, q7, q0 ; qs0 - vshr.u8 q1, q1, #1 ; a = a / 2 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - - vqadd.u8 q12, q12, q1 ; a = b + a - - veor q8, q8, q0 ; qs1 - veor q4, q4, q0 ; ps2 - veor q9, q9, q0 ; qs2 - - vorr q14, q13, q14 ; vp8_hevmask - - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 - - vsubl.s8 q2, d14, d12 ; qs0 - ps0 - vsubl.s8 q13, d15, d13 - - vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - - vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) - - vand q15, q15, q12 ; vp8_filter_mask - - vmul.i16 q13, q13, q11 - - vmov.u8 q12, #3 ; #3 - - vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 - - vmov.u8 q11, #4 ; #4 - - ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q13 - - vand q1, q1, q15 ; vp8_filter &= mask - - vmov.u16 q15, #63 ; #63 - - vand q13, q1, q14 ; Filter2 &= hev - - vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) - vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - - vmov q0, q15 - - vshr.s8 q2, q2, #3 ; Filter1 >>= 3 - vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - - vmov q11, q15 - vmov q12, q15 - - vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) - - vbic q1, q1, q14 ; vp8_filter &= ~hev - - ; roughly 1/7th difference across boundary - ; roughly 2/7th difference across boundary - ; roughly 3/7th difference across boundary - - vmov.u8 d5, #9 ; #9 - vmov.u8 d4, #18 ; #18 - - vmov q13, q15 - vmov q14, q15 - - vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 - vmlal.s8 q11, d3, d5 - vmov.u8 d5, #27 ; #27 - vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 - vmlal.s8 q13, d3, d4 - vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 - vmlal.s8 q15, d3, d5 - - vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d1, q11, #7 - vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) - vqshrn.s16 d25, q13, #7 - 
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) - vqshrn.s16 d29, q15, #7 - - vmov.u8 q1, #0x80 ; 0x80 - - vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) - vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) - vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) - vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) - vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) - vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - - veor q9, q11, q1 ; *oq2 = s^0x80 - veor q4, q0, q1 ; *op2 = s^0x80 - veor q8, q13, q1 ; *oq1 = s^0x80 - veor q5, q12, q1 ; *op2 = s^0x80 - veor q7, q15, q1 ; *oq0 = s^0x80 - veor q6, q14, q1 ; *op0 = s^0x80 - - bx lr - ENDP ; |vp8_mbloop_filter_neon| - -;----------------- - - END diff --git a/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c new file mode 100644 index 000000000000..5351f4be665b --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -0,0 +1,625 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" + +static INLINE void vp8_mbloop_filter_neon( + uint8x16_t qblimit, // mblimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p2 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q4r, // p1 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r, // q1 + uint8x16_t *q9r) { // q1 + uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8; + uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16; + int8x16_t q0s8, q12s8, q14s8, q15s8; + int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q1u8 = vabdq_u8(q9, q8); + q0u8 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q1u8 = vmaxq_u8(q1u8, q0u8); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q12u8 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q1u8); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + q1u8 = vabdq_u8(q5, q8); + q12u8 = vqaddq_u8(q12u8, q12u8); + + // vp8_filter() function + // convert to signed + q0u8 = vdupq_n_u8(0x80); + q9 = veorq_u8(q9, q0u8); + q8 = veorq_u8(q8, q0u8); + q7 = veorq_u8(q7, q0u8); + q6 = veorq_u8(q6, q0u8); + q5 = veorq_u8(q5, q0u8); + q4 = veorq_u8(q4, q0u8); + + q1u8 = vshrq_n_u8(q1u8, 1); + q12u8 = vqaddq_u8(q12u8, q1u8); + + q14u8 = vorrq_u8(q13u8, q14u8); + q12u8 = vcgeq_u8(qblimit, q12u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q11s16 = vdupq_n_s16(3); + q2s16 = vmulq_s16(q2s16, q11s16); + q13s16 = vmulq_s16(q13s16, q11s16); + + q15u8 = vandq_u8(q15u8, q12u8); + + 
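
At this point the vector code has folded both limit tests into a single mask: each lane of q15u8 is all-ones exactly where the edge should be filtered. A scalar sketch of the same per-pixel predicate may be easier to read alongside the vabdq/vmaxq/vcgeq chain above; the helper name and signature below are mine, illustrative only, and not part of the patch:

#include <stdlib.h>  /* abs() */

/* Nonzero means "filter this edge position": all six neighbour
   differences must be within limit, and the weighted edge difference
   (abs(p0-q0)*2 + abs(p1-q1)/2) must be within mblimit. */
static int mb_filter_mask_scalar(int mblimit, int limit,
                                 int p3, int p2, int p1, int p0,
                                 int q0, int q1, int q2, int q3) {
    int mask = 1;
    mask &= abs(p3 - p2) <= limit;
    mask &= abs(p2 - p1) <= limit;
    mask &= abs(p1 - p0) <= limit;
    mask &= abs(q1 - q0) <= limit;
    mask &= abs(q2 - q1) <= limit;
    mask &= abs(q3 - q2) <= limit;
    mask &= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= mblimit;
    return mask;
}

The NEON version computes the same thing branchlessly: it takes the maximum of the six differences and issues one vcgeq_u8 against limit, then ANDs in the mblimit comparison.
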
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8)); + + q12u8 = vdupq_n_u8(3); + q11u8 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2 = vqmovn_s16(q2s16); + d3 = vqmovn_s16(q13s16); + q1s8 = vcombine_s8(d2, d3); + q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8)); + q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8)); + q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8)); + q2s8 = vshrq_n_s8(q2s8, 3); + q13s8 = vshrq_n_s8(q13s8, 3); + + q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8); + q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8); + + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63); + d5 = vdup_n_s8(9); + d4 = vdup_n_s8(18); + + q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5); + q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5); + d5 = vdup_n_s8(27); + q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4); + q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4); + q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5); + q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5); + + d0 = vqshrn_n_s16(q0s16 , 7); + d1 = vqshrn_n_s16(q11s16, 7); + d24 = vqshrn_n_s16(q12s16, 7); + d25 = vqshrn_n_s16(q13s16, 7); + d28 = vqshrn_n_s16(q14s16, 7); + d29 = vqshrn_n_s16(q15s16, 7); + + q0s8 = vcombine_s8(d0, d1); + q12s8 = vcombine_s8(d24, d25); + q14s8 = vcombine_s8(d28, d29); + + q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8); + q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8); + q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8); + q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8); + q15s8 = vqsubq_s8((q7s8), q14s8); + q14s8 = vqaddq_s8((q6s8), q14s8); + + q1u8 = vdupq_n_u8(0x80); + *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8); + *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8); + *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8); + return; +} + +void vp8_mbloop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + src -= (pitch * 6); + vst1q_u8(src, q4); + src += pitch; + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + src += pitch; + vst1q_u8(src, q9); + return; +} + +void vp8_mbloop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, 
d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + u -= (pitch * 6); + v -= (pitch * 6); + vst1_u8(u, vget_low_u8(q4)); + u += pitch; + vst1_u8(v, vget_high_u8(q4)); + v += pitch; + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(u, vget_low_u8(q8)); + u += pitch; + vst1_u8(v, vget_high_u8(q8)); + v += pitch; + vst1_u8(u, vget_low_u8(q9)); + vst1_u8(v, vget_high_u8(q9)); + return; +} + +void vp8_mbloop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s1, *s2; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s1 = src - 4; + s2 = s1 + 8 * pitch; + d6 = vld1_u8(s1); + s1 += pitch; + d7 = vld1_u8(s2); + s2 += pitch; + d8 = vld1_u8(s1); + s1 += pitch; + d9 = vld1_u8(s2); + s2 += pitch; + d10 = vld1_u8(s1); + s1 += pitch; + d11 = vld1_u8(s2); + s2 += pitch; + d12 = vld1_u8(s1); + s1 += pitch; + d13 = vld1_u8(s2); + s2 += pitch; + d14 = vld1_u8(s1); + s1 += pitch; + d15 = vld1_u8(s2); + s2 += pitch; + d16 = vld1_u8(s1); + s1 += pitch; + d17 = vld1_u8(s2); + s2 += pitch; + d18 = vld1_u8(s1); + s1 += pitch; + d19 = vld1_u8(s2); + s2 += pitch; + d20 = vld1_u8(s1); + d21 = vld1_u8(s2); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = 
vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + s1 -= 7 * pitch; + s2 -= 7 * pitch; + + vst1_u8(s1, vget_low_u8(q3)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q3)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q4)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q4)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q5)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q5)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q6)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q6)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q7)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q7)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q8)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q8)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q9)); + s1 += pitch; + vst1_u8(s2, vget_high_u8(q9)); + s2 += pitch; + vst1_u8(s1, vget_low_u8(q10)); + vst1_u8(s2, vget_high_u8(q10)); + return; +} + +void vp8_mbloop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; 
+ uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + vs = v - 4; + d6 = vld1_u8(us); + us += pitch; + d7 = vld1_u8(vs); + vs += pitch; + d8 = vld1_u8(us); + us += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d10 = vld1_u8(us); + us += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d12 = vld1_u8(us); + us += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d14 = vld1_u8(us); + us += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d16 = vld1_u8(us); + us += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d18 = vld1_u8(us); + us += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d20 = vld1_u8(us); + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q4, &q5, &q6, &q7, &q8, &q9); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = 
q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + ud = u - 4; + vst1_u8(ud, vget_low_u8(q3)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q4)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q5)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q6)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q7)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q8)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q9)); + ud += pitch; + vst1_u8(ud, vget_low_u8(q10)); + + vd = v - 4; + vst1_u8(vd, vget_high_u8(q3)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q4)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q5)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q6)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q7)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q8)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q9)); + vd += pitch; + vst1_u8(vd, vget_high_u8(q10)); + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/reconintra_neon.c b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c new file mode 100644 index 000000000000..af52cd5ea517 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/reconintra_neon.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/common/blockd.h" + +void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { + const int mode = x->mode_info_context->mbmi.mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x16_t v_expected_dc = vdupq_n_u8(128); + + if (shift) { + unsigned int average = 0; + int expected_dc; + if (x->up_available) { + const uint8x16_t v_above = vld1q_u8(yabove_row); + const uint16x8_t a = vpaddlq_u8(v_above); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); + average = vget_lane_u32(d, 0); + } + if (x->left_available) { + for (i = 0; i < 16; ++i) { + average += yleft[0]; + yleft += left_stride; + } + } + shift += 3; + expected_dc = (average + (1 << (shift - 1))) >> shift; + v_expected_dc = vmovq_n_u8((uint8_t)expected_dc); + } + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_expected_dc); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + { + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + vst1q_u8(ypred_ptr, v_above); + ypred_ptr += y_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 16; ++i) { + const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]); + yleft += left_stride; + vst1q_u8(ypred_ptr, v_yleft); + ypred_ptr += y_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]); + const uint8x16_t v_above = vld1q_u8(yabove_row); + for (i = 0; i < 16; ++i) { + const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]); + const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft); + const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft); + const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo), + 
vreinterpretq_s16_u16(v_ytop_left)); + const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi), + vreinterpretq_s16_u16(v_ytop_left)); + const uint8x8_t pred_lo = vqmovun_s16(b_lo); + const uint8x8_t pred_hi = vqmovun_s16(b_hi); + + vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi)); + ypred_ptr += y_stride; + yleft += left_stride; + } + } + break; + } +} + +void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { + const int mode = x->mode_info_context->mbmi.uv_mode; + int i; + + switch (mode) { + case DC_PRED: + { + int shift = x->up_available + x->left_available; + uint8x8_t v_expected_udc = vdup_n_u8(128); + uint8x8_t v_expected_vdc = vdup_n_u8(128); + + if (shift) { + unsigned int average_u = 0; + unsigned int average_v = 0; + int expected_udc; + int expected_vdc; + if (x->up_available) { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove)); + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0); + average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2); + } + if (x->left_available) { + for (i = 0; i < 8; ++i) { + average_u += uleft[0]; + uleft += left_stride; + average_v += vleft[0]; + vleft += left_stride; + } + } + shift += 2; + expected_udc = (average_u + (1 << (shift - 1))) >> shift; + expected_vdc = (average_v + (1 << (shift - 1))) >> shift; + v_expected_udc = vmov_n_u8((uint8_t)expected_udc); + v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc); + } + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_expected_udc); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_expected_vdc); + vpred_ptr += pred_stride; + } + } + break; + case V_PRED: + { + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + vst1_u8(upred_ptr, v_uabove); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vabove); + vpred_ptr += pred_stride; + } + } + break; + case H_PRED: + { + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]); + uleft += left_stride; + vleft += left_stride; + vst1_u8(upred_ptr, v_uleft); + upred_ptr += pred_stride; + vst1_u8(vpred_ptr, v_vleft); + vpred_ptr += pred_stride; + } + } + break; + case TM_PRED: + { + const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]); + const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]); + const uint8x8_t v_uabove = vld1_u8(uabove_row); + const uint8x8_t v_vabove = vld1_u8(vabove_row); + for (i = 0; i < 8; ++i) { + const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]); + const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]); + const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft); + const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft); + const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u), + vreinterpretq_s16_u16(v_utop_left)); + const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v), + vreinterpretq_s16_u16(v_vtop_left)); + const uint8x8_t pred_u = vqmovun_s16(b_u); + const uint8x8_t pred_v = vqmovun_s16(b_v); + + vst1_u8(upred_ptr, pred_u); + vst1_u8(vpred_ptr, pred_v); + upred_ptr += pred_stride; + vpred_ptr += pred_stride; + uleft += left_stride; + 
vleft += left_stride; + } + } + break; + } +} diff --git a/media/libvpx/vp8/common/arm/neon/sad16_neon.asm b/media/libvpx/vp8/common/arm/neon/sad16_neon.asm deleted file mode 100644 index d7c590e15a21..000000000000 --- a/media/libvpx/vp8/common/arm/neon/sad16_neon.asm +++ /dev/null @@ -1,207 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad16x16_neon| - EXPORT |vp8_sad16x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int src_stride -; r2 unsigned char *ref_ptr -; r3 int ref_stride -|vp8_sad16x16_neon| PROC -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0] - vld1.8 {q7}, [r2] - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================== -;unsigned int vp8_sad16x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -|vp8_sad16x8_neon| PROC - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - 
vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/media/libvpx/vp8/common/arm/neon/sad8_neon.asm b/media/libvpx/vp8/common/arm/neon/sad8_neon.asm deleted file mode 100644 index 23ba6df93a4d..000000000000 --- a/media/libvpx/vp8/common/arm/neon/sad8_neon.asm +++ /dev/null @@ -1,209 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad8x8_neon| - EXPORT |vp8_sad8x16_neon| - EXPORT |vp8_sad4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; unsigned int vp8_sad8x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x8_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================ -;unsigned int vp8_sad8x16_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x16_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;=========================== -;unsigned int vp8_sad4x4_c( -; unsigned char *src_ptr, -; int 
src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad4x4_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 d1, d24 - vpaddl.u32 d0, d1 - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/media/libvpx/vp8/common/arm/neon/sad_neon.c b/media/libvpx/vp8/common/arm/neon/sad_neon.c new file mode 100644 index 000000000000..6595ac0519b0 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/sad_neon.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +unsigned int vp8_sad8x8_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 7; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vp8_sad8x16_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vp8_sad4x4_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int vp8_sad16x16_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = 
vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 15; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int vp8_sad16x8_neon( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} diff --git a/media/libvpx/vp8/common/arm/neon/save_reg_neon.asm b/media/libvpx/vp8/common/arm/neon/save_reg_neon.asm deleted file mode 100644 index fd7002e7a9ec..000000000000 --- a/media/libvpx/vp8/common/arm/neon/save_reg_neon.asm +++ /dev/null @@ -1,36 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_push_neon| - EXPORT |vp8_pop_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|vp8_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - -|vp8_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - - END - diff --git a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm deleted file mode 100644 index 67d2ab0150dd..000000000000 --- a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm +++ /dev/null @@ -1,139 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
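
Each of the vp8_sad*_neon kernels in the new sad_neon.c above finishes with the same widening pairwise-add reduction of the u16 SAD accumulator. A hedged standalone view of that idiom, for reference while reading those tails (the helper name is mine, not part of the patch):

#include <arm_neon.h>

/* Collapse a uint16x8_t of partial SADs to one scalar, exactly as the
   vpaddlq_u16 / vpaddlq_u32 / vadd_u32 sequence in each kernel above. */
static unsigned int horizontal_add_u16x8(uint16x8_t v) {
    const uint32x4_t a = vpaddlq_u16(v);   /* pairwise widen u16 -> u32 */
    const uint64x2_t b = vpaddlq_u32(a);   /* pairwise widen u32 -> u64 */
    const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                  vreinterpret_u32_u64(vget_high_u64(b)));
    return vget_lane_u32(c, 0);            /* final scalar sum */
}
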
-; - - - EXPORT |vp8_short_idct4x4llm_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;************************************************************* -;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, -; unsigned char *dst, int stride) -;r0 short * input -;r1 short * pred -;r2 int pitch -;r3 unsigned char dst -;sp int stride -;************************************************************* - -; static const int cospi8sqrt2minus1=20091; -; static const int sinpi8sqrt2 =35468; -; static const int rounding = 0; - -; Optimization note: The resulted data from dequantization are signed -; 13-bit data that is in the range of [-4096, 4095]. This allows to -; use "vqdmulh"(neon) instruction since it won't go out of range -; (13+16+1=30bits<32bits). This instruction gives the high half -; result of the multiplication that is needed in IDCT. - -|vp8_short_idct4x4llm_neon| PROC - adr r12, idct_coeff - vld1.16 {q1, q2}, [r0] - vld1.16 {d0}, [r12] - - vswp d3, d4 ;q2(vp[4] vp[12]) - ldr r0, [sp] ; stride - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vswp d3, d4 - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - ; load prediction data - vld1.32 d6[0], [r1], r2 - vld1.32 d6[1], [r1], r2 - vld1.32 d7[0], [r1], r2 - vld1.32 d7[1], [r1], r2 - - ; add prediction and residual - vaddw.u8 q1, q1, d6 - vaddw.u8 q2, q2, d7 - - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - - ; store to destination - vst1.32 d1[0], [r3], r0 - vst1.32 d1[1], [r3], r0 - vst1.32 d2[0], [r3], r0 - vst1.32 d2[1], [r3], r0 - - bx lr - - ENDP - -;----------------- - -idct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c - -;20091, 20091, 35468, 35468 - - END diff --git a/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c new file mode 100644 index 000000000000..373afa6ed356 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; + +void vp8_short_idct4x4llm_neon( + int16_t *input, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint32x2_t d6u32 = vdup_n_u32(0); + uint8x8_t d1u8; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + uint16x8_t q1u16; + int16x8_t q1s16, q2s16, q3s16, q4s16; + int32x2x2_t v2tmp0, v2tmp1; + int16x4x2_t v2tmp2, v2tmp3; + + d2 = vld1_s16(input); + d3 = vld1_s16(input + 4); + d4 = vld1_s16(input + 8); + d5 = vld1_s16(input + 12); + + // 1st for loop + q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here + q2s16 = vcombine_s16(d3, d5); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q3s16 = vshrq_n_s16(q3s16, 1); + q4s16 = vshrq_n_s16(q4s16, 1); + + q3s16 = vqaddq_s16(q3s16, q2s16); + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + // 2nd for loop + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); + q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); + + q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); + q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); + + d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 + d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 + + q3s16 = vshrq_n_s16(q3s16, 1); + q4s16 = vshrq_n_s16(q4s16, 1); + + q3s16 = vqaddq_s16(q3s16, q2s16); + q4s16 = vqaddq_s16(q4s16, q2s16); + + d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 + d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), + vreinterpret_s16_s32(v2tmp1.val[0])); + v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), + vreinterpret_s16_s32(v2tmp1.val[1])); + + q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); + q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); + + // dc_only_idct_add + for (i = 0; i < 2; i++, q1s16 = q2s16) { + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); + pred_ptr += pred_stride; + d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), + vreinterpret_u8_u32(d6u32)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); + dst_ptr += 
dst_stride; + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm deleted file mode 100644 index 9fdafd3609ed..000000000000 --- a/media/libvpx/vp8/common/arm/neon/sixtappredict16x16_neon.asm +++ /dev/null @@ -1,490 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter16_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to -; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication, -; the result can be negtive. So, I treat the result as s16. But, since it is also possible -; that the result can be a large positive number (> 2^15-1), which could be confused as a -; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2, -; which ensures that the result stays in s16 range. Finally, saturated add the result by -; applying 3rd filter coeff. Same applys to other filter functions. 
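
The ordering trick described in the note translates directly to NEON intrinsics. The sketch below is illustrative only (the function and parameter names are mine, not part of the patch): f[] holds the absolute values of the six taps, taps 1 and 4 are the negative ones and are applied with vmlsl, the accumulation order 0, 1, 4, 5, 2 keeps the u16 accumulator safe to reinterpret as s16, and tap 3 is added last with a saturating add before the final round/shift/narrow, just as the asm's vqrshrun.s16 ... #7 does:

#include <arm_neon.h>
#include <stdint.h>

/* One row of 8 output pixels through a 6-tap subpel filter. Assumes the
   caller provides the 2-pixel left and 3-pixel right margin the taps read
   (src[-2] .. src[10]). */
static uint8x8_t sixtap_row8_sketch(const uint8_t *src, const uint8x8_t f[6]) {
    uint16x8_t acc, tap3;
    int16x8_t sum;

    acc = vmull_u8(vld1_u8(src - 2), f[0]);       /* + src[-2] * |f0| */
    acc = vmlsl_u8(acc, vld1_u8(src - 1), f[1]);  /* - src[-1] * |f1| */
    acc = vmlsl_u8(acc, vld1_u8(src + 2), f[4]);  /* - src[+2] * |f4| */
    acc = vmlal_u8(acc, vld1_u8(src + 3), f[5]);  /* + src[+3] * |f5| */
    acc = vmlal_u8(acc, vld1_u8(src + 0), f[2]);  /* + src[ 0] * |f2| */

    tap3 = vmull_u8(vld1_u8(src + 1), f[3]);      /* + src[+1] * |f3| */
    sum = vqaddq_s16(vreinterpretq_s16_u16(acc),  /* saturating add of tap 3 */
                     vreinterpretq_s16_u16(tap3));

    return vqrshrun_n_s16(sum, 7);                /* (sum + 64) >> 7, clamped */
}

The first-pass loops below follow this same per-row structure, unrolled to process several rows per iteration.
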
- -|vp8_sixtap_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, filter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter16x16_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter16x16_only - - sub sp, sp, #336 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #7 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (21x16) -filt_blk2d_fp16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q9, d7, d0 - vmull.u8 q10, d9, d0 - vmull.u8 q11, d10, d0 - vmull.u8 q12, d12, d0 - vmull.u8 q13, d13, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d9, d10, #1 - vext.8 d30, d12, d13, #1 - - vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q12, d30, d1 - - vext.8 d28, d7, d8, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d13, d14, #1 - - vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q11, d29, d1 - vmlsl.u8 q13, d30, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d9, d10, #4 - vext.8 d30, d12, d13, #4 - - vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q12, d30, d4 - - vext.8 d28, d7, d8, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d13, d14, #4 - - vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q11, d29, d4 - vmlsl.u8 q13, d30, d4 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - vext.8 d30, d12, d13, #5 - - vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q10, d29, d5 - vmlal.u8 q12, d30, d5 - - vext.8 d28, d7, d8, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d13, d14, #5 - - vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q11, d29, d5 - vmlal.u8 q13, d30, d5 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d9, d10, #2 - vext.8 d30, d12, d13, #2 - - vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q10, d29, d2 - vmlal.u8 q12, d30, d2 - - vext.8 d28, d7, d8, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d13, d14, #2 - - vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q11, d29, d2 - vmlal.u8 q13, d30, d2 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d9, d10, #3 - vext.8 d30, d12, d13, #3 - - vext.8 d15, d7, d8, #3 - vext.8 d31, d10, d11, #3 - vext.8 d6, d13, d14, #3 - - vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - - vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) - vqadd.s16 q10, q5 - vqadd.s16 q12, q6 - - vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q7, d31, d3 - vmull.u8 q3, d6, d3 - - subs r2, r2, #1 - - vqadd.s16 q9, q6 - vqadd.s16 q11, q7 - vqadd.s16 q13, q3 - - vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q9, #7 - vqrshrun.s16 d8, q10, #7 - vqrshrun.s16 d9, q11, 
#7 - vqrshrun.s16 d10, q12, #7 - vqrshrun.s16 d11, q13, #7 - - vst1.u8 {d6, d7, d8}, [lr]! ;store result - vst1.u8 {d9, d10, d11}, [lr]! - - bne filt_blk2d_fp16x16_loop_neon - -;Second pass: 16x16 -;secondpass_filter - do first 8-columns and then second 8-columns - add r3, r12, r3, lsl #5 - sub lr, lr, #336 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - mov r2, #16 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp16x16_outloop_neon - vld1.u8 {d18}, [lr], r2 ;load src data - vld1.u8 {d19}, [lr], r2 - vld1.u8 {d20}, [lr], r2 - vld1.u8 {d21}, [lr], r2 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [lr], r2 - -secondpass_inner_loop_neon - vld1.u8 {d23}, [lr], r2 ;load src data - vld1.u8 {d24}, [lr], r2 - vld1.u8 {d25}, [lr], r2 - vld1.u8 {d26}, [lr], r2 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_inner_loop_neon - - subs r3, r3, #1 - sub lr, lr, #336 - add lr, lr, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_sp16x16_outloop_neon - - add sp, sp, #336 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter16x16_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #8 ;loop counter - sub r0, r0, #2 ;move srcptr back to (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - - pld [r0] - pld [r0, r1] - - vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q7, d7, d0 - vmull.u8 q8, d9, d0 - vmull.u8 q9, d10, d0 - - vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d21, d9, d10, #1 - vext.8 d22, d7, d8, #1 - vext.8 d23, d10, d11, #1 - vext.8 d24, d6, d7, #4 ;construct src_ptr[2] - vext.8 d25, d9, d10, #4 - vext.8 d26, d7, d8, #4 - vext.8 d27, d10, d11, #4 - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - - vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d21, d1 - vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q9, d23, d1 - vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4]) - 
vmlsl.u8 q8, d25, d4 - vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q9, d27, d4 - vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - - vext.8 d20, d7, d8, #5 - vext.8 d21, d10, d11, #5 - vext.8 d22, d6, d7, #2 ;construct src_ptr[0] - vext.8 d23, d9, d10, #2 - vext.8 d24, d7, d8, #2 - vext.8 d25, d10, d11, #2 - - vext.8 d26, d6, d7, #3 ;construct src_ptr[1] - vext.8 d27, d9, d10, #3 - vext.8 d28, d7, d8, #3 - vext.8 d29, d10, d11, #3 - - vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q9, d21, d5 - vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d23, d2 - vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q9, d25, d2 - - vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q11, d27, d3 - vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q15, d29, d3 - - vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q11 - vqadd.s16 q7, q12 - vqadd.s16 q9, q15 - - subs r2, r2, #1 - - vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q7, #7 - vqrshrun.s16 d8, q8, #7 - vqrshrun.s16 d9, q9, #7 - - vst1.u8 {q3}, [r4], r5 ;store result - vst1.u8 {q4}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - - pop {r4-r5,pc} - -;-------------------- -secondpass_filter16x16_only -;Second pass: 16x16 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_spo16x16_outloop_neon - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r0], r1 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [r0], r1 - -secondpass_only_inner_loop_neon - vld1.u8 {d23}, [r0], r1 ;load src data - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_only_inner_loop_neon - - subs r3, r3, #1 - sub r0, r0, r1, lsl #4 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1 - add r0, r0, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_spo16x16_outloop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - END diff --git 
a/media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm deleted file mode 100644 index a4222bc62c54..000000000000 --- a/media/libvpx/vp8/common/arm/neon/sixtappredict4x4_neon.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter4_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_sixtap_predict4x4_neon| PROC - push {r4, lr} - - adr r12, filter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter4x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter4x4_only - - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - sub r0, r0, r1, lsl #1 ;go back 2 lines of src data - -;First pass: output_height lines x output_width columns (9x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, 
d18, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q10, d10, d3 - - vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data - vld1.u8 {q4}, [r0], r1 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - ;First Pass on rest 5-line data - vld1.u8 {q11}, [r0], r1 - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vext.8 d31, d22, d23, #5 ;construct src_ptr[3] - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q8, d20, d5 - vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5]) - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) - vmlal.u8 q8, d10, d0 - vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] - - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d20, d1 - vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vext.8 d31, d22, d23, #4 ;construct src_ptr[2] - - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d10, d4 - vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vext.8 d31, d22, d23, #2 ;construct src_ptr[0] - - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d20, d2 - vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vext.8 d31, d22, d23, #3 ;construct src_ptr[1] - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q10, d10, d3 - vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3]) - - add r3, r12, r3, lsl #5 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - vqadd.s16 q12, q11 - - vext.8 d23, d27, d28, #4 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d30, q8, #7 - vqrshrun.s16 d31, q12, #7 - -;Second pass: 4x4 - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, 
d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -firstpass_filter4x4_only - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - -;First pass: output_height lines x output_width columns (4x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q10, d10, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - vst1.32 {d27[0]}, [r4] ;store result - vst1.32 {d27[1]}, [r0] - vst1.32 {d28[0]}, [r1] - vst1.32 {d28[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -secondpass_filter4x4_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.32 {d27[0]}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.32 {d27[1]}, [r0], r1 - vabs.s32 q7, q5 - vld1.32 {d28[0]}, [r0], r1 - vabs.s32 q8, q6 - vld1.32 {d28[1]}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.32 {d29[0]}, [r0], r1 - vdup.8 d1, d14[4] - vld1.32 {d29[1]}, [r0], r1 - vdup.8 d2, d15[0] - vld1.32 {d30[0]}, [r0], r1 - vdup.8 d3, d15[4] - vld1.32 {d30[1]}, [r0], r1 - vdup.8 d4, d16[0] - vld1.32 {d31[0]}, [r0], r1 - vdup.8 
d5, d16[4] - - vext.8 d23, d27, d28, #4 - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - ENDP - -;----------------- - - END diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm deleted file mode 100644 index a57ec015f2c0..000000000000 --- a/media/libvpx/vp8/common/arm/neon/sixtappredict8x4_neon.asm +++ /dev/null @@ -1,473 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x4_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x4_only - - sub sp, sp, #32 ;reserve space on stack for temporary storage - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - mov lr, sp - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (9x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 
;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q6}, [r0], r1 - vst1.u8 {d24}, [lr]! - vld1.u8 {q7}, [r0], r1 - vst1.u8 {d25}, [lr]! - - ;first_pass filtering on the rest 5-line data - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - 
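A recurring idiom in these kernels: the taps are loaded signed, reduced to magnitudes with vabs.s32, and the two always-non-positive taps (indices 1 and 4 in every row of the coefficient table) are applied with multiply-subtract, vmlsl.u8, so every multiply stays in the unsigned 8x8-to-16 widening form. A scalar restatement of why this is valid (sixtap_abs is an illustrative name):

#include <stdlib.h>

/* Equal to the plain signed dot product, because VP8's six-tap rows
 * always have f[1] <= 0, f[4] <= 0 and the remaining taps >= 0. */
static int sixtap_abs(const unsigned char s[6], const int f[6]) {
    return  abs(f[0]) * s[0] - abs(f[1]) * s[1] + abs(f[2]) * s[2]
          + abs(f[3]) * s[3] - abs(f[4]) * s[4] + abs(f[5]) * s[5];
}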
vqrshrun.s16 d27, q9, #7 - vqrshrun.s16 d28, q10, #7 - vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack - vqrshrun.s16 d30, q12, #7 - -;Second pass: 8x4 -;secondpass_filter - add r3, r12, r3, lsl #5 - sub lr, lr, #32 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {q12}, [lr]! - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - add sp, sp, #32 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter8x4_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - vld1.u8 {q3}, [r0], r1 ;load src data - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (4x8) - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 
q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x4_only -;Second pass: 8x4 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {d22}, [r0], r1 - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d25}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d28}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d29}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d30}, [r0], r1 - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm deleted file mode 100644 index 00ed5aeefe3c..000000000000 --- a/media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm +++ /dev/null @@ -1,524 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_sixtap_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x8_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x8_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x8_only - - sub sp, sp, #64 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (13x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - -filt_blk2d_fp8x8_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - subs r2, r2, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d24}, [lr]! 
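Structurally, the two-pass 8-wide paths filter output_height + 5 source rows horizontally (13 for the 8x8 case, per the comment above), stream the early results through the stack block reserved with sub sp, sp, #64 and addressed via lr, keep the last five rows in registers, and then filter vertically. A conceptual scalar driver for the 8x8 case, reusing the sixtap_pixel sketch earlier (illustrative only; hf and vf are the horizontal and vertical tap rows):

static void sixtap_8x8_ref(const unsigned char *src, int stride,
                           unsigned char *dst, int pitch,
                           const int *hf, const int *vf) {
    unsigned char tmp[13 * 8];   /* 8 output rows + 5 extra for the taps */
    int r, c;
    for (r = 0; r < 13; ++r)     /* first pass: horizontal filter */
        for (c = 0; c < 8; ++c)
            tmp[r * 8 + c] = sixtap_pixel(src + (r - 2) * stride + c, 1, hf);
    for (r = 0; r < 8; ++r)      /* second pass: vertical filter */
        for (c = 0; c < 8; ++c)
            dst[r * pitch + c] = sixtap_pixel(tmp + (r + 2) * 8 + c, 8, vf);
}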
- vld1.u8 {q6}, [r0], r1 - vst1.u8 {d25}, [lr]! - - bne filt_blk2d_fp8x8_loop_neon - - ;first_pass filtering on the rest 5-line data - ;vld1.u8 {q3}, [r0], r1 ;load src data - ;vld1.u8 {q4}, [r0], r1 - ;vld1.u8 {q5}, [r0], r1 - ;vld1.u8 {q6}, [r0], r1 - vld1.u8 {q7}, [r0], r1 - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - add r3, r12, r3, lsl #5 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - sub lr, lr, #64 - vqrshrun.s16 d27, q9, #7 - vld1.u8 {q9}, [lr]! ;load intermediate data from stack - vqrshrun.s16 d28, q10, #7 - vld1.u8 {q10}, [lr]! - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q11, #7 - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vqrshrun.s16 d30, q12, #7 - vld1.u8 {q12}, [lr]! 
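The pld hints in the first-pass loops prefetch the upcoming source rows. The C replacement below appears to keep the same hints via __builtin_prefetch (it stubs the builtin to a no-op for MSVC near the top of the new file). The pattern, sketched:

static void prefetch_3_rows(const unsigned char *src, int stride) {
    __builtin_prefetch(src);                /* pld [r0]             */
    __builtin_prefetch(src + stride);       /* pld [r0, r1]         */
    __builtin_prefetch(src + 2 * stride);   /* pld [r0, r1, lsl #1] */
}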
- -;Second pass: 8x8 - mov r3, #2 ;loop counter - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_sp8x8_loop_neon - - add sp, sp, #64 - pop {r4-r5,pc} - -;--------------------- -firstpass_filter8x8_only - ;add r2, r12, r2, lsl #5 ;calculate filter location - ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (8x8) -filt_blk2d_fpo8x8_loop_neon - vld1.u8 {q3}, [r0], r1 ;load src data - vld1.u8 {q4}, [r0], r1 - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3]) - 
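In the second-pass loop above (filt_blk2d_sp8x8_loop_neon, and likewise the spo variant further on), the 13 first-pass rows sit in d18-d30; each iteration consumes the nine rows d18-d26, writes four destination rows, and the trailing vmov chain (vmov q9, q11 ... vmov d26, d30) advances the window by four rows, so two iterations cover all eight outputs without reloading. Restated on an illustrative array standing in for d18-d30:

#include <arm_neon.h>

/* Advance a 13-row buffer's 9-row working window by 4 rows, as the
 * nine d-register moves of the vmov chain do between iterations. */
static void advance_window_by4(uint8x8_t row[13]) {
    int i;
    for (i = 0; i < 9; ++i)
        row[i] = row[i + 4];
}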
vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - ; - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - subs r2, r2, #1 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - bne filt_blk2d_fpo8x8_loop_neon - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x8_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {d19}, [r0], r1 - vabs.s32 q7, q5 - vld1.u8 {d20}, [r0], r1 - vabs.s32 q8, q6 - vld1.u8 {d21}, [r0], r1 - mov r3, #2 ;loop counter - vld1.u8 {d22}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d23}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d24}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d25}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - -;Second pass: 8x8 -filt_blk2d_spo8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_spo8x8_loop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c new file mode 100644 index 000000000000..aad6133f5f23 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -0,0 +1,1757 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#ifdef _MSC_VER +#define __builtin_prefetch(x) +#endif + +static const int8_t vp8_sub_pel_filters[8][8] = { + {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are */ + {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ + {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -9, 93, 50, -6, 0, 0, 0}, + {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ + {0, -6, 50, 93, -9, 0, 0, 0}, + {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ + {0, -1, 12, 123, -6, 0, 0, 0}, +}; + +void vp8_sixtap_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d18u8, d19u8, d20u8, d21u8; + uint8x8_t d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q11u8; + uint64x2_t q3u64, q4u64, q5u64, q6u64, q9u64, q10u64; + uint32x2x2_t d0u32x2, d1u32x2; + + if (xoffset == 0) { // secondpass_filter4x4_only + uint32x2_t d27u32 = vdup_n_u32(0); + uint32x2_t d28u32 = vdup_n_u32(0); + uint32x2_t d29u32 = vdup_n_u32(0); + uint32x2_t d30u32 = vdup_n_u32(0); + uint32x2_t d31u32 = vdup_n_u32(0); + + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 0); + src += src_pixels_per_line; + d27u32 = vld1_lane_u32((const uint32_t *)src, d27u32, 1); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 0); + src += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src, d28u32, 1); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 0); + src += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src, d29u32, 1); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 0); + src += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src, d30u32, 1); + src += src_pixels_per_line; + d31u32 = vld1_lane_u32((const uint32_t *)src, d31u32, 0); + + d27u8 = vreinterpret_u8_u32(d27u32); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 = vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + d31u8 = vreinterpret_u8_u32(d31u32); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8);
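The intrinsic chains in this new file keep the asm's shape: products accumulate into two uint16x8_t registers (the 8-wide kernels keep the tap-3 product in its own accumulator), the halves are combined with a saturating signed add, and vqrshrun_n_s16(..., 7) narrows back to pixels. A self-contained sketch of one 8-wide column (sixtap_8wide, s0..s5 and f0..f5 are illustrative names; f0..f5 are the abs-valued taps prepared as above):

#include <arm_neon.h>

/* s0..s5: source vectors at src[-2]..src[3]; f0..f5: |tap| values.
 * Taps 1 and 4 are subtracted, mirroring the vmlsl_u8 calls above. */
static uint8x8_t sixtap_8wide(uint8x8_t s0, uint8x8_t s1, uint8x8_t s2,
                              uint8x8_t s3, uint8x8_t s4, uint8x8_t s5,
                              uint8x8_t f0, uint8x8_t f1, uint8x8_t f2,
                              uint8x8_t f3, uint8x8_t f4, uint8x8_t f5) {
    uint16x8_t a = vmull_u8(s0, f0);       /* + f[0] * src[-2] */
    a = vmlsl_u8(a, s1, f1);               /* - f[1] * src[-1] */
    a = vmlsl_u8(a, s4, f4);               /* - f[4] * src[ 2] */
    a = vmlal_u8(a, s2, f2);               /* + f[2] * src[ 0] */
    a = vmlal_u8(a, s5, f5);               /* + f[5] * src[ 3] */
    uint16x8_t b = vmull_u8(s3, f3);       /* + f[3] * src[ 1] */
    int16x8_t sum = vqaddq_s16(vreinterpretq_s16_u16(b),
                               vreinterpretq_s16_u16(a));
    return vqrshrun_n_s16(sum, 7);         /* round, shift, saturate */
}

In the 4x4 first pass further down, the same chain runs on packed data: vzip_u32 puts two 4-pixel rows in one 64-bit register, and vshrq_n_u64 by 8, 16, 24 and 32 synthesizes the src_ptr[-1], src_ptr[0], src_ptr[1] and src_ptr[2] phases, so each multiply covers two output rows at once.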
+ + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + + // keep original src data in q4 q6 + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + + d0u32x2 = 
vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + + d27u8 = vqrshrun_n_s16(q7s16, 7); + d28u8 = vqrshrun_n_s16(q8s16, 7); + + if (yoffset == 0) { // firstpass_filter4x4_only + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d27u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + return; + } + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q11u8 = vld1q_u8(src); + + d18u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d19u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d20u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d21u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + // vswp here + q3u8 = vcombine_u8(vget_low_u8(q3u8), vget_low_u8(q4u8)); + q5u8 = vcombine_u8(vget_low_u8(q5u8), vget_low_u8(q6u8)); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(d18u8), // d18 d19 + vreinterpret_u32_u8(d19u8)); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(d20u8), // d20 d21 + vreinterpret_u32_u8(d21u8)); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 5); + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d5u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d5u8); + q12u16 = vmull_u8(d31u8, d5u8); + + q4u64 = vreinterpretq_u64_u8(q3u8); + q6u64 = vreinterpretq_u64_u8(q5u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q3u8)), // d6 d7 + vreinterpret_u32_u8(vget_high_u8(q3u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q5u8)), // d10 d11 + vreinterpret_u32_u8(vget_high_u8(q5u8))); + q9u64 = vshrq_n_u64(q4u64, 8); + q10u64 = vshrq_n_u64(q6u64, 8); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmlal_u8(q8u16, 
vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q12u16 = vmlal_u8(q12u16, vget_low_u8(q11u8), d0u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 32); + q5u64 = vshrq_n_u64(q6u64, 32); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 1); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d1u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + q9u64 = vshrq_n_u64(q4u64, 16); + q10u64 = vshrq_n_u64(q6u64, 16); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 4); + q7u16 = vmlsl_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d4u8); + q8u16 = vmlsl_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q9u64)), // d18 d19 + vreinterpret_u32_u64(vget_high_u64(q9u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q10u64)), // d20 d211 + vreinterpret_u32_u64(vget_high_u64(q10u64))); + q3u64 = vshrq_n_u64(q4u64, 24); + q5u64 = vshrq_n_u64(q6u64, 24); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 2); + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d0u32x2.val[0]), d2u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d1u32x2.val[0]), d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q3u64)), // d6 d7 + vreinterpret_u32_u64(vget_high_u64(q3u64))); + d1u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), // d10 d11 + vreinterpret_u32_u64(vget_high_u64(q5u64))); + d31u8 = vext_u8(vget_low_u8(q11u8), vget_high_u8(q11u8), 3); + q9u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d3u8); + q10u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d3u8); + q11u16 = vmull_u8(d31u8, d3u8); + + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q7s16 = vqaddq_s16(q7s16, q9s16); + q8s16 = vqaddq_s16(q8s16, q10s16); + q12s16 = vqaddq_s16(q12s16, q11s16); + + d29u8 = vqrshrun_n_s16(q7s16, 7); + d30u8 = vqrshrun_n_s16(q8s16, 7); + d31u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 4x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + d23u8 = vext_u8(d27u8, d28u8, 4); + d24u8 = vext_u8(d28u8, d29u8, 4); + d25u8 = vext_u8(d29u8, d30u8, 4); + d26u8 = vext_u8(d30u8, d31u8, 4); + + q3u16 = vmull_u8(d27u8, d0u8); + q4u16 = vmull_u8(d28u8, d0u8); + q5u16 = vmull_u8(d25u8, d5u8); + q6u16 = vmull_u8(d26u8, d5u8); + + q3u16 = 
vmlsl_u8(q3u16, d29u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d30u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d1u8); + + q3u16 = vmlal_u8(q3u16, d28u8, d2u8); + q4u16 = vmlal_u8(q4u16, d29u8, d2u8); + q5u16 = vmlal_u8(q5u16, d24u8, d3u8); + q6u16 = vmlal_u8(q6u16, d25u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + + q5s16 = vqaddq_s16(q5s16, q3s16); + q6s16 = vqaddq_s16(q6s16, q4s16); + + d3u8 = vqrshrun_n_s16(q5s16, 7); + d4u8 = vqrshrun_n_s16(q6s16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d4u8), 1); + return; +} + +void vp8_sixtap_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8; + + if (xoffset == 0) { // secondpass_filter8x4_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, 
d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = 
vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + return; + } + + // First Pass on rest 5-line data + src += src_pixels_per_line; + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), 
vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x4 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + q3u16 = vmull_u8(d22u8, d0u8); + q4u16 = vmull_u8(d23u8, d0u8); + q5u16 = vmull_u8(d24u8, d0u8); + q6u16 = vmull_u8(d25u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d23u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d24u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d25u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d26u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d26u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d27u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d28u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d29u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d24u8, d2u8); + q4u16 = vmlal_u8(q4u16, d25u8, d2u8); + q5u16 = vmlal_u8(q5u16, d26u8, d2u8); + q6u16 = vmlal_u8(q6u16, d27u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d27u8, d5u8); + q4u16 = vmlal_u8(q4u16, d28u8, d5u8); + q5u16 = vmlal_u8(q5u16, d29u8, d5u8); + q6u16 = vmlal_u8(q6u16, d30u8, d5u8); + + q7u16 = vmull_u8(d25u8, d3u8); + q8u16 = vmull_u8(d26u8, d3u8); + q9u16 = vmull_u8(d27u8, d3u8); + q10u16 = vmull_u8(d28u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, 
q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + return; +} + +void vp8_sixtap_predict8x8_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *tmpp; + unsigned char tmp[64]; + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16; + uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src = src_ptr - src_pixels_per_line * 2; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + d27u8 = vld1_u8(src); + src += src_pixels_per_line; + d28u8 = vld1_u8(src); + src += src_pixels_per_line; + d29u8 = vld1_u8(src); + src += src_pixels_per_line; + d30u8 = vld1_u8(src); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = 
vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) // firstpass_filter4x4_only + src = src_ptr - 2; + else + src = src_ptr - 2 - (src_pixels_per_line * 2); + + tmpp = tmp; + for (i = 2; i > 0; i--) { + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + + q7u16 = vmlsl_u8(q7u16, d28u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d1u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + + q7u16 = vmlsl_u8(q7u16, d28u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d29u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d30u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d31u8, d4u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + + q7u16 = vmlal_u8(q7u16, d28u8, d2u8); + q8u16 = vmlal_u8(q8u16, d29u8, d2u8); + q9u16 = vmlal_u8(q9u16, d30u8, d2u8); + q10u16 = vmlal_u8(q10u16, d31u8, d2u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d30u8 = 
vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + + q7u16 = vmlal_u8(q7u16, d28u8, d5u8); + q8u16 = vmlal_u8(q8u16, d29u8, d5u8); + q9u16 = vmlal_u8(q9u16, d30u8, d5u8); + q10u16 = vmlal_u8(q10u16, d31u8, d5u8); + + d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + + q3u16 = vmull_u8(d28u8, d3u8); + q4u16 = vmull_u8(d29u8, d3u8); + q5u16 = vmull_u8(d30u8, d3u8); + q6u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d22u8 = vqrshrun_n_s16(q7s16, 7); + d23u8 = vqrshrun_n_s16(q8s16, 7); + d24u8 = vqrshrun_n_s16(q9s16, 7); + d25u8 = vqrshrun_n_s16(q10s16, 7); + + if (yoffset == 0) { // firstpass_filter8x4_only + vst1_u8(dst_ptr, d22u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d23u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d24u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d25u8); + dst_ptr += dst_pitch; + } else { + vst1_u8(tmpp, d22u8); + tmpp += 8; + vst1_u8(tmpp, d23u8); + tmpp += 8; + vst1_u8(tmpp, d24u8); + tmpp += 8; + vst1_u8(tmpp, d25u8); + tmpp += 8; + } + } + if (yoffset == 0) + return; + + // First Pass on rest 5-line data + q3u8 = vld1q_u8(src); + src += src_pixels_per_line; + q4u8 = vld1q_u8(src); + src += src_pixels_per_line; + q5u8 = vld1q_u8(src); + src += src_pixels_per_line; + q6u8 = vld1q_u8(src); + src += src_pixels_per_line; + q7u8 = vld1q_u8(src); + + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8); + q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1); + + q8u16 = vmlsl_u8(q8u16, d27u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d1u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4); + + q8u16 = vmlsl_u8(q8u16, d27u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d30u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d31u8, d4u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2); + d31u8 = vext_u8(vget_low_u8(q7u8), 
vget_high_u8(q7u8), 2); + + q8u16 = vmlal_u8(q8u16, d27u8, d2u8); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q11u16 = vmlal_u8(q11u16, d30u8, d2u8); + q12u16 = vmlal_u8(q12u16, d31u8, d2u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5); + + q8u16 = vmlal_u8(q8u16, d27u8, d5u8); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q11u16 = vmlal_u8(q11u16, d30u8, d5u8); + q12u16 = vmlal_u8(q12u16, d31u8, d5u8); + + d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3); + d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3); + d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3); + d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3); + d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3); + + q3u16 = vmull_u8(d27u8, d3u8); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + + q8s16 = vqaddq_s16(q8s16, q3s16); + q9s16 = vqaddq_s16(q9s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q11s16 = vqaddq_s16(q11s16, q6s16); + q12s16 = vqaddq_s16(q12s16, q7s16); + + d26u8 = vqrshrun_n_s16(q8s16, 7); + d27u8 = vqrshrun_n_s16(q9s16, 7); + d28u8 = vqrshrun_n_s16(q10s16, 7); + d29u8 = vqrshrun_n_s16(q11s16, 7); + d30u8 = vqrshrun_n_s16(q12s16, 7); + + // Second pass: 8x8 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + tmpp = tmp; + q9u8 = vld1q_u8(tmpp); + tmpp += 16; + q10u8 = vld1q_u8(tmpp); + tmpp += 16; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + q12u8 = vld1q_u8(tmpp); + + d18u8 = vget_low_u8(q9u8); + d19u8 = vget_high_u8(q9u8); + d20u8 = vget_low_u8(q10u8); + d21u8 = vget_high_u8(q10u8); + d22u8 = vget_low_u8(q11u8); + d23u8 = vget_high_u8(q11u8); + d24u8 = vget_low_u8(q12u8); + d25u8 = vget_high_u8(q12u8); + + for (i = 2; i > 0; i--) { + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, 
d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + d23u8 = d27u8; + d24u8 = d28u8; + d25u8 = d29u8; + d26u8 = d30u8; + + vst1_u8(dst_ptr, d6u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d7u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d8u8); + dst_ptr += dst_pitch; + vst1_u8(dst_ptr, d9u8); + dst_ptr += dst_pitch; + } + return; +} + +void vp8_sixtap_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char *src, *src_tmp, *dst, *tmpp; + unsigned char tmp[336]; + int i, j; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8; + uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8; + uint8x8_t d28u8, d29u8, d30u8, d31u8; + int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8; + uint8x16_t q3u8, q4u8; + uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16; + uint16x8_t q11u16, q12u16, q13u16, q15u16; + int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16; + int16x8_t q11s16, q12s16, q13s16, q15s16; + + if (xoffset == 0) { // secondpass_filter8x8_only + // load second_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // load src data + src_tmp = src_ptr - src_pixels_per_line * 2; + for (i = 0; i < 2; i++) { + src = src_tmp + i * 8; + dst = dst_ptr + i * 8; + d18u8 = vld1_u8(src); + src += src_pixels_per_line; + d19u8 = vld1_u8(src); + src += src_pixels_per_line; + d20u8 = vld1_u8(src); + src += src_pixels_per_line; + d21u8 = vld1_u8(src); + src += src_pixels_per_line; + d22u8 = vld1_u8(src); + src += src_pixels_per_line; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(src); + src += src_pixels_per_line; + d24u8 = vld1_u8(src); + src += src_pixels_per_line; + d25u8 = vld1_u8(src); + src += src_pixels_per_line; + d26u8 = vld1_u8(src); + src += src_pixels_per_line; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = 
vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); + q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; + } + + // load first_pass filter + dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + // First pass: output_height lines x output_width columns (9x4) + if (yoffset == 0) { // firstpass_filter4x4_only + src = src_ptr - 2; + dst = dst_ptr; + for (i = 0; i < 8; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + + q6u16 = vmull_u8(d6u8, d0u8); + q7u16 = vmull_u8(d7u8, d0u8); + q8u16 = vmull_u8(d9u8, d0u8); + q9u16 = vmull_u8(d10u8, d0u8); + + d20u8 = vext_u8(d6u8, d7u8, 1); + d21u8 = vext_u8(d9u8, d10u8, 1); + d22u8 = vext_u8(d7u8, d8u8, 1); + d23u8 = vext_u8(d10u8, d11u8, 1); + d24u8 = vext_u8(d6u8, d7u8, 4); + d25u8 = vext_u8(d9u8, d10u8, 4); + d26u8 = vext_u8(d7u8, d8u8, 4); + d27u8 = vext_u8(d10u8, d11u8, 4); + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + + q6u16 = vmlsl_u8(q6u16, d20u8, d1u8); + q8u16 = vmlsl_u8(q8u16, d21u8, d1u8); + q7u16 = vmlsl_u8(q7u16, d22u8, d1u8); + q9u16 = vmlsl_u8(q9u16, d23u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d24u8, d4u8); + q8u16 = vmlsl_u8(q8u16, d25u8, d4u8); + q7u16 = vmlsl_u8(q7u16, d26u8, d4u8); + q9u16 = vmlsl_u8(q9u16, d27u8, d4u8); + q6u16 = vmlal_u8(q6u16, d28u8, d5u8); + q8u16 = 
vmlal_u8(q8u16, d29u8, d5u8); + + d20u8 = vext_u8(d7u8, d8u8, 5); + d21u8 = vext_u8(d10u8, d11u8, 5); + d22u8 = vext_u8(d6u8, d7u8, 2); + d23u8 = vext_u8(d9u8, d10u8, 2); + d24u8 = vext_u8(d7u8, d8u8, 2); + d25u8 = vext_u8(d10u8, d11u8, 2); + d26u8 = vext_u8(d6u8, d7u8, 3); + d27u8 = vext_u8(d9u8, d10u8, 3); + d28u8 = vext_u8(d7u8, d8u8, 3); + d29u8 = vext_u8(d10u8, d11u8, 3); + + q7u16 = vmlal_u8(q7u16, d20u8, d5u8); + q9u16 = vmlal_u8(q9u16, d21u8, d5u8); + q6u16 = vmlal_u8(q6u16, d22u8, d2u8); + q8u16 = vmlal_u8(q8u16, d23u8, d2u8); + q7u16 = vmlal_u8(q7u16, d24u8, d2u8); + q9u16 = vmlal_u8(q9u16, d25u8, d2u8); + + q10u16 = vmull_u8(d26u8, d3u8); + q11u16 = vmull_u8(d27u8, d3u8); + q12u16 = vmull_u8(d28u8, d3u8); + q15u16 = vmull_u8(d29u8, d3u8); + + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q15s16 = vreinterpretq_s16_u16(q15u16); + + q6s16 = vqaddq_s16(q6s16, q10s16); + q8s16 = vqaddq_s16(q8s16, q11s16); + q7s16 = vqaddq_s16(q7s16, q12s16); + q9s16 = vqaddq_s16(q9s16, q15s16); + + d6u8 = vqrshrun_n_s16(q6s16, 7); + d7u8 = vqrshrun_n_s16(q7s16, 7); + d8u8 = vqrshrun_n_s16(q8s16, 7); + d9u8 = vqrshrun_n_s16(q9s16, 7); + + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + vst1q_u8(dst, q3u8); + dst += dst_pitch; + vst1q_u8(dst, q4u8); + dst += dst_pitch; + } + return; + } + + src = src_ptr - 2 - src_pixels_per_line * 2; + tmpp = tmp; + for (i = 0; i < 7; i++) { + d6u8 = vld1_u8(src); + d7u8 = vld1_u8(src + 8); + d8u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d9u8 = vld1_u8(src); + d10u8 = vld1_u8(src + 8); + d11u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + d12u8 = vld1_u8(src); + d13u8 = vld1_u8(src + 8); + d14u8 = vld1_u8(src + 16); + src += src_pixels_per_line; + + __builtin_prefetch(src); + __builtin_prefetch(src + src_pixels_per_line); + __builtin_prefetch(src + src_pixels_per_line * 2); + + q8u16 = vmull_u8(d6u8, d0u8); + q9u16 = vmull_u8(d7u8, d0u8); + q10u16 = vmull_u8(d9u8, d0u8); + q11u16 = vmull_u8(d10u8, d0u8); + q12u16 = vmull_u8(d12u8, d0u8); + q13u16 = vmull_u8(d13u8, d0u8); + + d28u8 = vext_u8(d6u8, d7u8, 1); + d29u8 = vext_u8(d9u8, d10u8, 1); + d30u8 = vext_u8(d12u8, d13u8, 1); + q8u16 = vmlsl_u8(q8u16, d28u8, d1u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d1u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d1u8); + d28u8 = vext_u8(d7u8, d8u8, 1); + d29u8 = vext_u8(d10u8, d11u8, 1); + d30u8 = vext_u8(d13u8, d14u8, 1); + q9u16 = vmlsl_u8(q9u16, d28u8, d1u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d1u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d1u8); + + d28u8 = vext_u8(d6u8, d7u8, 4); + d29u8 = vext_u8(d9u8, d10u8, 4); + d30u8 = vext_u8(d12u8, d13u8, 4); + q8u16 = vmlsl_u8(q8u16, d28u8, d4u8); + q10u16 = vmlsl_u8(q10u16, d29u8, d4u8); + q12u16 = vmlsl_u8(q12u16, d30u8, d4u8); + d28u8 = vext_u8(d7u8, d8u8, 4); + d29u8 = vext_u8(d10u8, d11u8, 4); + d30u8 = vext_u8(d13u8, d14u8, 4); + q9u16 = vmlsl_u8(q9u16, d28u8, d4u8); + q11u16 = vmlsl_u8(q11u16, d29u8, d4u8); + q13u16 = vmlsl_u8(q13u16, d30u8, d4u8); + + d28u8 = vext_u8(d6u8, d7u8, 5); + d29u8 = vext_u8(d9u8, d10u8, 5); + d30u8 = vext_u8(d12u8, d13u8, 5); + q8u16 = vmlal_u8(q8u16, d28u8, d5u8); + q10u16 = vmlal_u8(q10u16, d29u8, d5u8); + q12u16 = vmlal_u8(q12u16, d30u8, d5u8); + d28u8 = vext_u8(d7u8, d8u8, 5); + d29u8 = vext_u8(d10u8, d11u8, 5); + d30u8 = 
vext_u8(d13u8, d14u8, 5); + q9u16 = vmlal_u8(q9u16, d28u8, d5u8); + q11u16 = vmlal_u8(q11u16, d29u8, d5u8); + q13u16 = vmlal_u8(q13u16, d30u8, d5u8); + + d28u8 = vext_u8(d6u8, d7u8, 2); + d29u8 = vext_u8(d9u8, d10u8, 2); + d30u8 = vext_u8(d12u8, d13u8, 2); + q8u16 = vmlal_u8(q8u16, d28u8, d2u8); + q10u16 = vmlal_u8(q10u16, d29u8, d2u8); + q12u16 = vmlal_u8(q12u16, d30u8, d2u8); + d28u8 = vext_u8(d7u8, d8u8, 2); + d29u8 = vext_u8(d10u8, d11u8, 2); + d30u8 = vext_u8(d13u8, d14u8, 2); + q9u16 = vmlal_u8(q9u16, d28u8, d2u8); + q11u16 = vmlal_u8(q11u16, d29u8, d2u8); + q13u16 = vmlal_u8(q13u16, d30u8, d2u8); + + d28u8 = vext_u8(d6u8, d7u8, 3); + d29u8 = vext_u8(d9u8, d10u8, 3); + d30u8 = vext_u8(d12u8, d13u8, 3); + d15u8 = vext_u8(d7u8, d8u8, 3); + d31u8 = vext_u8(d10u8, d11u8, 3); + d6u8 = vext_u8(d13u8, d14u8, 3); + q4u16 = vmull_u8(d28u8, d3u8); + q5u16 = vmull_u8(d29u8, d3u8); + q6u16 = vmull_u8(d30u8, d3u8); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + q12s16 = vreinterpretq_s16_u16(q12u16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q10s16 = vqaddq_s16(q10s16, q5s16); + q12s16 = vqaddq_s16(q12s16, q6s16); + + q6u16 = vmull_u8(d15u8, d3u8); + q7u16 = vmull_u8(d31u8, d3u8); + q3u16 = vmull_u8(d6u8, d3u8); + q3s16 = vreinterpretq_s16_u16(q3u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q11s16 = vreinterpretq_s16_u16(q11u16); + q13s16 = vreinterpretq_s16_u16(q13u16); + q9s16 = vqaddq_s16(q9s16, q6s16); + q11s16 = vqaddq_s16(q11s16, q7s16); + q13s16 = vqaddq_s16(q13s16, q3s16); + + d6u8 = vqrshrun_n_s16(q8s16, 7); + d7u8 = vqrshrun_n_s16(q9s16, 7); + d8u8 = vqrshrun_n_s16(q10s16, 7); + d9u8 = vqrshrun_n_s16(q11s16, 7); + d10u8 = vqrshrun_n_s16(q12s16, 7); + d11u8 = vqrshrun_n_s16(q13s16, 7); + + vst1_u8(tmpp, d6u8); + tmpp += 8; + vst1_u8(tmpp, d7u8); + tmpp += 8; + vst1_u8(tmpp, d8u8); + tmpp += 8; + vst1_u8(tmpp, d9u8); + tmpp += 8; + vst1_u8(tmpp, d10u8); + tmpp += 8; + vst1_u8(tmpp, d11u8); + tmpp += 8; + } + + // Second pass: 16x16 + dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]); + d0s8 = vdup_lane_s8(dtmps8, 0); + d1s8 = vdup_lane_s8(dtmps8, 1); + d2s8 = vdup_lane_s8(dtmps8, 2); + d3s8 = vdup_lane_s8(dtmps8, 3); + d4s8 = vdup_lane_s8(dtmps8, 4); + d5s8 = vdup_lane_s8(dtmps8, 5); + d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8)); + d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8)); + d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8)); + d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8)); + d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8)); + d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8)); + + for (i = 0; i < 2; i++) { + dst = dst_ptr + 8 * i; + tmpp = tmp + 8 * i; + d18u8 = vld1_u8(tmpp); + tmpp += 16; + d19u8 = vld1_u8(tmpp); + tmpp += 16; + d20u8 = vld1_u8(tmpp); + tmpp += 16; + d21u8 = vld1_u8(tmpp); + tmpp += 16; + d22u8 = vld1_u8(tmpp); + tmpp += 16; + for (j = 0; j < 4; j++) { + d23u8 = vld1_u8(tmpp); + tmpp += 16; + d24u8 = vld1_u8(tmpp); + tmpp += 16; + d25u8 = vld1_u8(tmpp); + tmpp += 16; + d26u8 = vld1_u8(tmpp); + tmpp += 16; + + q3u16 = vmull_u8(d18u8, d0u8); + q4u16 = vmull_u8(d19u8, d0u8); + q5u16 = vmull_u8(d20u8, d0u8); + q6u16 = vmull_u8(d21u8, d0u8); + + q3u16 = vmlsl_u8(q3u16, d19u8, d1u8); + q4u16 = vmlsl_u8(q4u16, d20u8, d1u8); + q5u16 = vmlsl_u8(q5u16, d21u8, d1u8); + q6u16 = vmlsl_u8(q6u16, d22u8, d1u8); + + q3u16 = vmlsl_u8(q3u16, d22u8, d4u8); + q4u16 = vmlsl_u8(q4u16, d23u8, d4u8); 
+ q5u16 = vmlsl_u8(q5u16, d24u8, d4u8); + q6u16 = vmlsl_u8(q6u16, d25u8, d4u8); + + q3u16 = vmlal_u8(q3u16, d20u8, d2u8); + q4u16 = vmlal_u8(q4u16, d21u8, d2u8); + q5u16 = vmlal_u8(q5u16, d22u8, d2u8); + q6u16 = vmlal_u8(q6u16, d23u8, d2u8); + + q3u16 = vmlal_u8(q3u16, d23u8, d5u8); + q4u16 = vmlal_u8(q4u16, d24u8, d5u8); + q5u16 = vmlal_u8(q5u16, d25u8, d5u8); + q6u16 = vmlal_u8(q6u16, d26u8, d5u8); + + q7u16 = vmull_u8(d21u8, d3u8); + q8u16 = vmull_u8(d22u8, d3u8); + q9u16 = vmull_u8(d23u8, d3u8); + q10u16 = vmull_u8(d24u8, d3u8); + + q3s16 = vreinterpretq_s16_u16(q3u16); + q4s16 = vreinterpretq_s16_u16(q4u16); + q5s16 = vreinterpretq_s16_u16(q5u16); + q6s16 = vreinterpretq_s16_u16(q6u16); + q7s16 = vreinterpretq_s16_u16(q7u16); + q8s16 = vreinterpretq_s16_u16(q8u16); + q9s16 = vreinterpretq_s16_u16(q9u16); + q10s16 = vreinterpretq_s16_u16(q10u16); + + q7s16 = vqaddq_s16(q7s16, q3s16); + q8s16 = vqaddq_s16(q8s16, q4s16); + q9s16 = vqaddq_s16(q9s16, q5s16); + q10s16 = vqaddq_s16(q10s16, q6s16); + + d6u8 = vqrshrun_n_s16(q7s16, 7); + d7u8 = vqrshrun_n_s16(q8s16, 7); + d8u8 = vqrshrun_n_s16(q9s16, 7); + d9u8 = vqrshrun_n_s16(q10s16, 7); + + d18u8 = d22u8; + d19u8 = d23u8; + d20u8 = d24u8; + d21u8 = d25u8; + d22u8 = d26u8; + + vst1_u8(dst, d6u8); + dst += dst_pitch; + vst1_u8(dst, d7u8); + dst += dst_pitch; + vst1_u8(dst, d8u8); + dst += dst_pitch; + vst1_u8(dst, d9u8); + dst += dst_pitch; + } + } + return; +} diff --git a/media/libvpx/vp8/common/arm/neon/variance_neon.asm b/media/libvpx/vp8/common/arm/neon/variance_neon.asm deleted file mode 100644 index e3b48327d3f5..000000000000 --- a/media/libvpx/vp8/common/arm/neon/variance_neon.asm +++ /dev/null @@ -1,276 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance16x16_neon| - EXPORT |vp8_variance16x8_neon| - EXPORT |vp8_variance8x16_neon| - EXPORT |vp8_variance8x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp8_variance16x16_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #8 - -variance16x16_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - ;VPADAL adds adjacent pairs of elements of a vector, and accumulates - ;the results into the elements of the destination vector. The explanation - ;in ARM guide is wrong. 
- vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - ;vmov.32 r0, d0[0] ;this instruction costs a lot - ;vmov.32 r1, d1[0] - ;mul r0, r0, r0 - ;str r1, [r12] - ;sub r0, r1, r0, lsr #8 - - ; while sum is signed, sum * sum is always positive and must be treated as - ; unsigned to avoid propagating the sign bit. - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - -;================================ -;unsigned int vp8_variance16x8_c( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *sse) -|vp8_variance16x8_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #4 - -variance16x8_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance16x8_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #7 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - -;================================= -;unsigned int vp8_variance8x16_c( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *sse) - -|vp8_variance8x16_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #8 - -variance8x16_neon_loop - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d2, d6 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - - bne variance8x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #7 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - -;================================== -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse 
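All four variance kernels in this deleted file, and their C replacements below, compute the same quantity. A scalar sketch (hypothetical helper; it assumes N = w * h is a power of two, which is why the division appears above as the >>8, >>7, and >>6 shifts):

    /* variance = SSE - sum^2 / N. sum may be negative, but sum^2 is
       always non-negative and is treated as unsigned so the sign bit
       is not propagated, matching the comment in the assembly. */
    static unsigned int variance_model(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h, unsigned int *sse) {
        int r, c, sum = 0;
        unsigned int sq = 0;
        for (r = 0; r < h; r++) {
            for (c = 0; c < w; c++) {
                int d = src[c] - ref[c];
                sum += d;
                sq += (unsigned int)(d * d);
            }
            src += src_stride;
            ref += ref_stride;
        }
        *sse = sq;
        return sq - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }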
-|vp8_variance8x8_neon| PROC - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - mov r12, #2 - -variance8x8_neon_loop - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d1}, [r0], r1 - vld1.8 {d5}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - vld1.8 {d3}, [r0], r1 - vld1.8 {d7}, [r2], r3 - - vsubl.u8 q11, d0, d4 ;calculate diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;calculate sum - vmlal.s16 q9, d22, d22 ;calculate sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne variance8x8_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - ldr r12, [sp] ;load *sse from stack - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r12] ;store sse - vshr.u32 d10, d10, #6 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - bx lr - - ENDP - - END diff --git a/media/libvpx/vp8/common/arm/neon/variance_neon.c b/media/libvpx/vp8/common/arm/neon/variance_neon.c new file mode 100644 index 000000000000..afd2dc3d1e26 --- /dev/null +++ b/media/libvpx/vp8/common/arm/neon/variance_neon.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+unsigned int vp8_variance16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vp8_variance16x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr +=
recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance8x16_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // variance8x16_neon_loop + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = 
vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance8x8_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 2; i++) { // variance8x8_neon_loop + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} diff --git a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm deleted file mode 100644 index 9d22c52521c7..000000000000 --- a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm +++ /dev/null @@ -1,423 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -;----------------- - - EXPORT |vp8_sub_pixel_variance16x16_neon_func| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pixels_per_line, -; stack(r6) unsigned int *sse -;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon. - -bilinear_taps_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - -|vp8_sub_pixel_variance16x16_neon_func| PROC - push {r4-r6, lr} - - adr r12, bilinear_taps_coeff - ldr r4, [sp, #16] ;load *dst_ptr from stack - ldr r5, [sp, #20] ;load dst_pixels_per_line from stack - ldr r6, [sp, #24] ;load *sse from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -vp8e_filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! 
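The first-pass loop above is a two-tap horizontal bilinear filter. In scalar form (illustrative helper, not part of the patch):

    /* Filter[0] + Filter[1] == 128 for every row of bilinear_taps_coeff,
       so the product sum is renormalized by the rounded narrowing shift
       vqrshrn.u16 #7, i.e. (x + 64) >> 7. */
    static unsigned char bilinear_h_model(const unsigned char *s,
                                          const unsigned char filter[2]) {
        return (unsigned char)((s[0] * filter[0] + s[1] * filter[1] + 64) >> 7);
    }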
- vld1.u8 {d11, d12, d13}, [r0], r1 - - bne vp8e_filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - sub sp, sp, #256 - mov r3, sp - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -vp8e_filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - vst1.u8 {d4, d5}, [r3]! - vst1.u8 {d6, d7}, [r3]! - vmov q11, q15 - vst1.u8 {d8, d9}, [r3]! 
- - bne vp8e_filt_blk2d_sp16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - sub sp, sp, #528 ;reserve space on stack for temporary storage - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - mov r3, sp - -;First Pass: output_height lines x output_width columns (16x16) -vp8e_filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r3]! ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r3]! - vst1.u8 {d18, d19}, [r3]! - vst1.u8 {d20, d21}, [r3]! - - bne vp8e_filt_blk2d_fpo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - sub sp, sp, #528 ;reserve space on stack for temporary storage - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - mov r3, sp - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -vp8e_filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r3]! - vmov q11, q15 - vst1.u8 {d6, d7}, [r3]! - vst1.u8 {d8, d9}, [r3]! - - bne vp8e_filt_blk2d_spo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;---------------------------- -;variance16x16 -sub_pixel_variance16x16_neon - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - sub r3, r3, #256 - mov r12, #8 - -sub_pixel_variance16x16_neon_loop - vld1.8 {q0}, [r3]! ;Load up source and reference - vld1.8 {q2}, [r4], r5 - vld1.8 {q1}, [r3]! 
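The variance loop interrupted here accumulates two running quantities from each batch of byte differences: a signed sum (vpadal.s16 into q8) and a sum of squares (vmlal.s16 into q9/q10). The same idiom reappears in the replacement C files in this patch; as a minimal sketch in NEON intrinsics (the helper name is illustrative, not from libvpx):

#include <arm_neon.h>

/* Accumulate one 8-pixel batch of (src - ref) into sum and SSE totals. */
static void accumulate_diff(uint8x8_t s, uint8x8_t r, int32x4_t *sum,
                            int32x4_t *sse_lo, int32x4_t *sse_hi) {
    /* vsubl.u8: widen before subtracting; reinterpreting the u16 result as
     * s16 recovers the signed difference exactly. */
    const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(s, r));
    /* vpadal.s16: pairwise-add the differences into the running sum. */
    *sum = vpadalq_s16(*sum, d);
    /* vmlal.s16: multiply-accumulate the squared differences (the SSE). */
    *sse_lo = vmlal_s16(*sse_lo, vget_low_s16(d), vget_low_s16(d));
    *sse_hi = vmlal_s16(*sse_hi, vget_high_s16(d), vget_high_s16(d));
}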
- vld1.8 {q3}, [r4], r5 - - vsubl.u8 q11, d0, d4 ;diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne sub_pixel_variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r6] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - add sp, sp, #528 - vmov.32 r0, d0[0] ;return - - pop {r4-r6,pc} - - ENDP - - END diff --git a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm deleted file mode 100644 index 155be4fc54b4..000000000000 --- a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm +++ /dev/null @@ -1,572 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_variance_halfpixvar16x16_h_neon| - EXPORT |vp8_variance_halfpixvar16x16_v_neon| - EXPORT |vp8_variance_halfpixvar16x16_hv_neon| - EXPORT |vp8_sub_pixel_variance16x16s_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;================================================ -;unsigned int vp8_variance_halfpixvar16x16_h_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp8_variance_halfpixvar16x16_h_neon| PROC - push {lr} - - mov r12, #4 ;loop counter - ldr lr, [sp, #4] ;load *sse from stack - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -;First Pass: output_height lines x output_width columns (16x16) -vp8_filt_fpo16x16s_4_0_loop_neon - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - vld1.8 {q11}, [r2], r3 - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.8 {q12}, [r2], r3 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.8 {q13}, [r2], r3 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - vext.8 q3, q2, q3, #1 - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vld1.8 {q14}, [r2], r3 - vrhadd.u8 q1, q2, q3 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - - vsubl.u8 q4, d0, d22 ;diff - vsubl.u8 q5, d1, d23 - vsubl.u8 q6, d2, d24 - vsubl.u8 q7, d3, d25 - vsubl.u8 q0, d4, d26 - vsubl.u8 q1, d5, d27 - vsubl.u8 q2, d6, d28 - vsubl.u8 q3, d7, d29 - - vpadal.s16 q8, q4 ;sum - vmlal.s16 q9, d8, d8 ;sse - vmlal.s16 q10, d9, d9 - - subs r12, r12, #1 - - vpadal.s16 q8, q5 - vmlal.s16 q9, d10, d10 - vmlal.s16 q10, d11, d11 - vpadal.s16 q8, q6 - vmlal.s16 q9, d12, d12 - vmlal.s16 q10, d13, d13 - vpadal.s16 q8, q7 - vmlal.s16 q9, d14, d14 - vmlal.s16 q10, d15, d15 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 
q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_fpo16x16s_4_0_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;================================================ -;unsigned int vp8_variance_halfpixvar16x16_v_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp8_variance_halfpixvar16x16_v_neon| PROC - push {lr} - - mov r12, #4 ;loop counter - - vld1.u8 {q0}, [r0], r1 ;load src data - ldr lr, [sp, #4] ;load *sse from stack - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -vp8_filt_spo16x16s_0_4_loop_neon - vld1.u8 {q2}, [r0], r1 - vld1.8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vld1.8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vld1.8 {q5}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - - vrhadd.u8 q0, q0, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q4 - vrhadd.u8 q4, q4, q6 - vrhadd.u8 q6, q6, q15 - - vsubl.u8 q11, d0, d2 ;diff - vsubl.u8 q12, d1, d3 - vsubl.u8 q13, d4, d6 - vsubl.u8 q14, d5, d7 - vsubl.u8 q0, d8, d10 - vsubl.u8 q1, d9, d11 - vsubl.u8 q2, d12, d14 - vsubl.u8 q3, d13, d15 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - - vmov q0, q15 - - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_spo16x16s_0_4_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.u32 d10, d10, #8 - vsub.u32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;================================================ -;unsigned int vp8_variance_halfpixvar16x16_hv_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp8_variance_halfpixvar16x16_hv_neon| PROC - push {lr} - - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - - ldr lr, [sp, #4] ;load *sse from stack - vmov.i8 q13, #0 ;q8 - sum - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - - vmov.i8 q14, #0 ;q9, q10 - sse - vmov.i8 q15, #0 - - mov r12, #4 ;loop counter - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - -;First Pass: output_height lines x output_width columns (17x16) -vp8_filt16x16s_4_4_loop_neon - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - vld1.u8 {d16, d17, d18, 
d19}, [r0], r1
-
- ;pld [r0]
- ;pld [r0, r1]
- ;pld [r0, r1, lsl #1]
-
- vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
- vext.8 q5, q4, q5, #1
- vext.8 q7, q6, q7, #1
- vext.8 q9, q8, q9, #1
-
- vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
- vrhadd.u8 q2, q4, q5
- vrhadd.u8 q3, q6, q7
- vrhadd.u8 q4, q8, q9
-
- vld1.8 {q5}, [r2], r3
- vrhadd.u8 q0, q0, q1
- vld1.8 {q6}, [r2], r3
- vrhadd.u8 q1, q1, q2
- vld1.8 {q7}, [r2], r3
- vrhadd.u8 q2, q2, q3
- vld1.8 {q8}, [r2], r3
- vrhadd.u8 q3, q3, q4
-
- vsubl.u8 q9, d0, d10 ;diff
- vsubl.u8 q10, d1, d11
- vsubl.u8 q11, d2, d12
- vsubl.u8 q12, d3, d13
-
- vsubl.u8 q0, d4, d14 ;diff
- vsubl.u8 q1, d5, d15
- vsubl.u8 q5, d6, d16
- vsubl.u8 q6, d7, d17
-
- vpadal.s16 q13, q9 ;sum
- vmlal.s16 q14, d18, d18 ;sse
- vmlal.s16 q15, d19, d19
-
- vpadal.s16 q13, q10 ;sum
- vmlal.s16 q14, d20, d20 ;sse
- vmlal.s16 q15, d21, d21
-
- vpadal.s16 q13, q11 ;sum
- vmlal.s16 q14, d22, d22 ;sse
- vmlal.s16 q15, d23, d23
-
- vpadal.s16 q13, q12 ;sum
- vmlal.s16 q14, d24, d24 ;sse
- vmlal.s16 q15, d25, d25
-
- subs r12, r12, #1
-
- vpadal.s16 q13, q0 ;sum
- vmlal.s16 q14, d0, d0 ;sse
- vmlal.s16 q15, d1, d1
-
- vpadal.s16 q13, q1 ;sum
- vmlal.s16 q14, d2, d2 ;sse
- vmlal.s16 q15, d3, d3
-
- vpadal.s16 q13, q5 ;sum
- vmlal.s16 q14, d10, d10 ;sse
- vmlal.s16 q15, d11, d11
-
- vmov q0, q4
-
- vpadal.s16 q13, q6 ;sum
- vmlal.s16 q14, d12, d12 ;sse
- vmlal.s16 q15, d13, d13
-
- bne vp8_filt16x16s_4_4_loop_neon
-
- vadd.u32 q15, q14, q15 ;accumulate sse
- vpaddl.s32 q0, q13 ;accumulate sum
-
- vpaddl.u32 q1, q15
- vadd.s64 d0, d0, d1
- vadd.u64 d1, d2, d3
-
- vmull.s32 q5, d0, d0
- vst1.32 {d1[0]}, [lr] ;store sse
- vshr.u32 d10, d10, #8
- vsub.u32 d0, d1, d10
-
- vmov.32 r0, d0[0] ;return
- pop {pc}
- ENDP
-
-;==============================
-; r0 unsigned char *src_ptr,
-; r1 int src_pixels_per_line,
-; r2 int xoffset,
-; r3 int yoffset,
-; stack unsigned char *dst_ptr,
-; stack int dst_pixels_per_line,
-; stack unsigned int *sse
-;note: in vp8_find_best_half_pixel_step()(called when 8
diff --git a/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
new file mode 100644
--- /dev/null
+++ b/media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+#include <arm_neon.h>
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef _MSC_VER
+#define __builtin_prefetch(x)
+#endif
+
+static const uint16_t bilinear_taps_coeff[8][2] = {
+ {128, 0},
+ {112, 16},
+ { 96, 32},
+ { 80, 48},
+ { 64, 64},
+ { 48, 80},
+ { 32, 96},
+ { 16, 112}
+};
+
+unsigned int vp8_sub_pixel_variance16x16_neon_func(
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const unsigned char *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse) {
+ int i;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+ unsigned char *tmpp;
+ unsigned char *tmpp2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
+ uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
+ uint8x8_t d19u8, d20u8, d21u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64, d2s64, d3s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
+ uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
+ uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ tmpp2 = tmp + 272;
+ tmpp = tmp;
+ if (xoffset == 0) { // secondpass_bfilter16x16_only
+ d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
+ d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
+
+ q11u8 = vld1q_u8(src_ptr);
+
src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } else if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + __builtin_prefetch(src_ptr); + __builtin_prefetch(src_ptr + src_pixels_per_line); + __builtin_prefetch(src_ptr + src_pixels_per_line * 2); + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, 
d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp2, q7u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q8u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q9u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q10u8); + tmpp2 += 16; + } + } else { + d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); + src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); + tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 
16); + src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); + tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); + + tmpp = tmp; + tmpp2 = tmpp + 272; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); + tmpp += 16; + q13u8 = vld1q_u8(tmpp); + tmpp += 16; + q14u8 = vld1q_u8(tmpp); + tmpp += 16; + q15u8 = vld1q_u8(tmpp); + tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + 
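/* The 528-byte tmp buffer mirrors the stack layout of the deleted
 * assembly: the first pass writes 17 filtered rows (17 * 16 = 272 bytes)
 * at tmp, and this second pass writes its 16 rows (256 bytes) at
 * tmp + 272, which is where the final variance loop reads them back. */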
vst1q_u8((uint8_t *)tmpp2, q1u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q2u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q3u8); + tmpp2 += 16; + vst1q_u8((uint8_t *)tmpp2, q4u8); + tmpp2 += 16; + } + } + + // sub_pixel_variance16x16_neon + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + tmpp = tmp + 272; + for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop + q0u8 = vld1q_u8(tmpp); + tmpp += 16; + q1u8 = vld1q_u8(tmpp); + tmpp += 16; + q2u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + q3u8 = vld1q_u8(dst_ptr); + dst_ptr += dst_pixels_per_line; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); + q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_h_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8; + uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q2u8 = vld1q_u8(src_ptr); + 
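/* Each source row is fetched as two q registers (32 bytes) because the
 * vextq_u8() below needs src_ptr[16], the right-hand neighbour of pixel
 * 15; vrhaddq_u8() then averages src[i] and src[i + 1] with rounding,
 * i.e. the half-pixel taps (64, 64) reduce to (a + b + 1) >> 1. */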
q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + + q11u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q12u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q13u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q14u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q1u8 = vextq_u8(q0u8, q1u8, 1); + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8)); + q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8)); + q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8)); + q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8)); + q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8)); + q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8)); + + d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16)); + d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16)); + q9s32 = vmlal_s16(q9s32, d8s16, d8s16); + q10s32 = vmlal_s16(q10s32, d9s16, d9s16); + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16)); + q9s32 = vmlal_s16(q9s32, d10s16, d10s16); + q10s32 = vmlal_s16(q10s32, d11s16, d11s16); + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16)); + q9s32 = vmlal_s16(q9s32, d12s16, d12s16); + q10s32 = vmlal_s16(q10s32, d13s16, d13s16); + d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16)); + d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16)); + q9s32 = vmlal_s16(q9s32, d14s16, d14s16); + q10s32 = vmlal_s16(q10s32, d15s16, d15s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + 
d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_v_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8; + uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q15u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + + q1u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q0u8 = vrhaddq_u8(q0u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q4u8); + q4u8 = vrhaddq_u8(q4u8, q6u8); + q6u8 = vrhaddq_u8(q6u8, q15u8); + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d8u8 = vget_low_u8(q4u8); + d9u8 = vget_high_u8(q4u8); + d12u8 = vget_low_u8(q6u8); + d13u8 = vget_high_u8(q6u8); + + q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8)); + q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8)); + q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8)); + q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8)); + q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8)); + q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8)); + q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8)); + q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, 
d29s16); + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); + q9s32 = vmlal_s16(q9s32, d0s16, d0s16); + q10s32 = vmlal_s16(q10s32, d1s16, d1s16); + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); + q9s32 = vmlal_s16(q9s32, d2s16, d2s16); + q10s32 = vmlal_s16(q10s32, d3s16, d3s16); + d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); + d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); + q9s32 = vmlal_s16(q9s32, d4s16, d4s16); + q10s32 = vmlal_s16(q10s32, d5s16, d5s16); + d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); + d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); + q9s32 = vmlal_s16(q9s32, d6s16, d6s16); + q10s32 = vmlal_s16(q10s32, d7s16, d7s16); + + q0u8 = q15u8; + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int vp8_variance_halfpixvar16x16_hv_neon( + const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, + unsigned int *sse) { + int i; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; + int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64, d2s64, d3s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; + uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; + int32x4_t q13s32, q14s32, q15s32; + int64x2_t q0s64, q1s64, q5s64; + + q13s32 = vdupq_n_s32(0); + q14s32 = vdupq_n_s32(0); + q15s32 = vdupq_n_s32(0); + + q0u8 = vld1q_u8(src_ptr); + q1u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q1u8 = vextq_u8(q0u8, q1u8, 1); + q0u8 = vrhaddq_u8(q0u8, q1u8); + for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon + q2u8 = vld1q_u8(src_ptr); + q3u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q4u8 = vld1q_u8(src_ptr); + q5u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q6u8 = vld1q_u8(src_ptr); + q7u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + q8u8 = vld1q_u8(src_ptr); + q9u8 = vld1q_u8(src_ptr + 16); + src_ptr += source_stride; + + q3u8 = vextq_u8(q2u8, q3u8, 1); + q5u8 = vextq_u8(q4u8, q5u8, 1); + q7u8 = vextq_u8(q6u8, q7u8, 1); + q9u8 = vextq_u8(q8u8, q9u8, 1); + + q1u8 = vrhaddq_u8(q2u8, q3u8); + q2u8 = vrhaddq_u8(q4u8, q5u8); + q3u8 = vrhaddq_u8(q6u8, q7u8); + q4u8 = vrhaddq_u8(q8u8, q9u8); + q0u8 = vrhaddq_u8(q0u8, q1u8); + q1u8 = vrhaddq_u8(q1u8, q2u8); + q2u8 = vrhaddq_u8(q2u8, q3u8); + q3u8 = vrhaddq_u8(q3u8, q4u8); + + q5u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q6u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q7u8 = vld1q_u8(ref_ptr); + ref_ptr += 
recon_stride; + q8u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + d0u8 = vget_low_u8(q0u8); + d1u8 = vget_high_u8(q0u8); + d2u8 = vget_low_u8(q1u8); + d3u8 = vget_high_u8(q1u8); + d4u8 = vget_low_u8(q2u8); + d5u8 = vget_high_u8(q2u8); + d6u8 = vget_low_u8(q3u8); + d7u8 = vget_high_u8(q3u8); + + q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); + q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); + q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); + q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); + q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); + q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); + q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); + q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); + q14s32 = vmlal_s16(q14s32, d18s16, d18s16); + q15s32 = vmlal_s16(q15s32, d19s16, d19s16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); + q14s32 = vmlal_s16(q14s32, d20s16, d20s16); + q15s32 = vmlal_s16(q15s32, d21s16, d21s16); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); + q14s32 = vmlal_s16(q14s32, d22s16, d22s16); + q15s32 = vmlal_s16(q15s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); + q14s32 = vmlal_s16(q14s32, d24s16, d24s16); + q15s32 = vmlal_s16(q15s32, d25s16, d25s16); + + d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); + d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); + q14s32 = vmlal_s16(q14s32, d0s16, d0s16); + q15s32 = vmlal_s16(q15s32, d1s16, d1s16); + + d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); + d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); + q14s32 = vmlal_s16(q14s32, d2s16, d2s16); + q15s32 = vmlal_s16(q15s32, d3s16, d3s16); + + d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); + d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); + q14s32 = vmlal_s16(q14s32, d10s16, d10s16); + q15s32 = vmlal_s16(q15s32, d11s16, d11s16); + + d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); + d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); + q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); + q14s32 = vmlal_s16(q14s32, d12s16, d12s16); + q15s32 = vmlal_s16(q15s32, d13s16, d13s16); + + q0u8 = q4u8; + } + + q15s32 = vaddq_s32(q14s32, q15s32); + q0s64 = vpaddlq_s32(q13s32); + q1s64 = vpaddlq_s32(q15s32); + + d0s64 = vget_low_s64(q0s64); + d1s64 = vget_high_s64(q0s64); + d2s64 = vget_low_s64(q1s64); + d3s64 = vget_high_s64(q1s64); + d0s64 = vadd_s64(d0s64, d1s64); + d1s64 = vadd_s64(d2s64, d3s64); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), + vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +enum { kWidth8 = 8 }; +enum { kHeight8 = 8 }; +enum { kHeight8PlusOne = 9 }; +enum { kPixelStepOne = 1 }; +enum { kAlign16 = 16 }; 
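Each kernel in this file ends with the same reduction: fold the sum and SSE accumulators horizontally, store *sse, and return the variance sse - sum*sum/N. The division by the pixel count N is a shift, which is why the vshr_n_u32 immediates above are 6, 7 and 8 for 64-, 128- and 256-pixel blocks. A scalar sketch of that contract (function name illustrative, not from libvpx):

#include <stdint.h>

static unsigned int scalar_variance(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, unsigned int *sse) {
    int i, j, sum = 0;
    unsigned int sq = 0;
    for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
            const int d = src[j] - ref[j];  /* difference, like vsubl.u8 */
            sum += d;                       /* "sum" accumulator */
            sq += (unsigned int)(d * d);    /* "sse" accumulator */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sq;
    /* sum*sum/N as a shift: >>6 for 8x8, >>7 for 16x8/8x16, >>8 for 16x16. */
    return sq - (unsigned int)(((int64_t)sum * sum) / (w * h));
}

The static variance8x8_neon added below computes exactly this for w = h = 8, using the (int64_t)sum * sum / 64 expression directly.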
+ +#define FILTER_BITS 7 + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint16_t *vpx_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... 
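/* In scalar terms, each output pixel computed above is
 *   out[j] = (f0 * src[j] + f1 * src[j + pixel_step] + 64) >> FILTER_BITS
 * (vrshrn_n_u16 adds the rounding constant 64 before shifting by 7);
 * pixel_step selects horizontal (1) or vertical (stride) filtering. */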
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp8_sub_pixel_variance8x8_neon( + const unsigned char *src, + int src_stride, + int xoffset, + int yoffset, + const unsigned char *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); + if (xoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8, + kWidth8, bilinear_taps_coeff[yoffset]); + } else if (yoffset == 0) { + var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + bilinear_taps_coeff[xoffset]); + } else { + var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + bilinear_taps_coeff[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, + kWidth8, bilinear_taps_coeff[yoffset]); + } + return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); +} + diff --git a/media/libvpx/vp8/common/arm/reconintra_arm.c b/media/libvpx/vp8/common/arm/reconintra_arm.c deleted file mode 100644 index 2874896e8e24..000000000000 --- a/media/libvpx/vp8/common/arm/reconintra_arm.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vp8/common/blockd.h" -#include "vpx_mem/vpx_mem.h" - -#if HAVE_NEON -extern void vp8_build_intra_predictors_mby_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x) -{ - unsigned char *y_buffer = x->dst.y_buffer; - unsigned char *ypred_ptr = x->predictor; - int y_stride = x->dst.y_stride; - int mode = x->mode_info_context->mbmi.mode; - int Up = x->up_available; - int Left = x->left_available; - - vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); -} - -extern void vp8_build_intra_predictors_mby_s_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x) -{ - unsigned char *y_buffer = x->dst.y_buffer; - unsigned char *ypred_ptr = x->predictor; - int y_stride = x->dst.y_stride; - int mode = x->mode_info_context->mbmi.mode; - int Up = x->up_available; - int Left = x->left_available; - - vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left); -} - -#endif diff --git a/media/libvpx/vp8/common/blockd.h b/media/libvpx/vp8/common/blockd.h index f7ff57763526..ea1a6a4adfde 100644 --- a/media/libvpx/vp8/common/blockd.h +++ b/media/libvpx/vp8/common/blockd.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_BLOCKD_H -#define __INC_BLOCKD_H +#ifndef VP8_COMMON_BLOCKD_H_ +#define VP8_COMMON_BLOCKD_H_ void vpx_log(const char *format, ...); @@ -20,6 +20,10 @@ void vpx_log(const char *format, ...); #include "treecoder.h" #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + /*#define DCPRED 1*/ #define DCPREDSIMTHRESH 0 #define DCPREDCNTTHRESH 3 @@ -297,4 +301,8 @@ typedef 
struct macroblockd
 extern void vp8_build_block_doffsets(MACROBLOCKD *x);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
-#endif /* __INC_BLOCKD_H */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_BLOCKD_H_
diff --git a/media/libvpx/vp8/common/coefupdateprobs.h b/media/libvpx/vp8/common/coefupdateprobs.h
index 9e194dc9a4dc..d96a19e7478c 100644
--- a/media/libvpx/vp8/common/coefupdateprobs.h
+++ b/media/libvpx/vp8/common/coefupdateprobs.h
@@ -8,6 +8,12 @@
 * be found in the AUTHORS file in the root of the source tree.
 */
+#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
+#define VP8_COMMON_COEFUPDATEPROBS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 /* Update probabilities for the nodes in the token entropy tree. Generated file included by entropy.c */
@@ -183,3 +189,9 @@ const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTE
 },
 },
 };
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_COEFUPDATEPROBS_H_
diff --git a/media/libvpx/vp8/common/common.h b/media/libvpx/vp8/common/common.h
index 2cc1c544cdfe..17262d6983cd 100644
--- a/media/libvpx/vp8/common/common.h
+++ b/media/libvpx/vp8/common/common.h
@@ -9,8 +9,8 @@
 */
-#ifndef common_h
-#define common_h 1
+#ifndef VP8_COMMON_COMMON_H_
+#define VP8_COMMON_COMMON_H_
 #include <assert.h>
@@ -18,6 +18,13 @@
 #include "vpx_mem/vpx_mem.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
 /* Only need this for fixed-size arrays, for structs just assign. */
 #define vp8_copy( Dest, Src) { \
@@ -37,4 +44,8 @@
 #define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
-#endif /* common_h */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_COMMON_COMMON_H_
diff --git a/media/libvpx/vp8/common/default_coef_probs.h b/media/libvpx/vp8/common/default_coef_probs.h
index 0d195636bcb7..4d69e4be6645 100644
--- a/media/libvpx/vp8/common/default_coef_probs.h
+++ b/media/libvpx/vp8/common/default_coef_probs.h
@@ -8,6 +8,12 @@
 * be found in the AUTHORS file in the root of the source tree.
*/ +#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif /*Generated file, included by entropy.c*/ @@ -186,3 +192,9 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES] } } }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/media/libvpx/vp8/common/entropy.h b/media/libvpx/vp8/common/entropy.h index 5389bc1de4e5..a90bab4bac2e 100644 --- a/media/libvpx/vp8/common/entropy.h +++ b/media/libvpx/vp8/common/entropy.h @@ -9,12 +9,16 @@ */ -#ifndef __INC_ENTROPY_H -#define __INC_ENTROPY_H +#ifndef VP8_COMMON_ENTROPY_H_ +#define VP8_COMMON_ENTROPY_H_ #include "treecoder.h" #include "blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Coefficient token alphabet */ #define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ @@ -98,4 +102,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]); extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; void vp8_coef_tree_initialize(void); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ENTROPY_H_ diff --git a/media/libvpx/vp8/common/entropymode.h b/media/libvpx/vp8/common/entropymode.h index 1df0f641e496..81bdfc4b8bdd 100644 --- a/media/libvpx/vp8/common/entropymode.h +++ b/media/libvpx/vp8/common/entropymode.h @@ -9,12 +9,16 @@ */ -#ifndef __INC_ENTROPYMODE_H -#define __INC_ENTROPYMODE_H +#ifndef VP8_COMMON_ENTROPYMODE_H_ +#define VP8_COMMON_ENTROPYMODE_H_ #include "onyxc_int.h" #include "treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef enum { SUBMVREF_NORMAL, @@ -77,4 +81,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x); void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]); void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ENTROPYMODE_H_ diff --git a/media/libvpx/vp8/common/entropymv.h b/media/libvpx/vp8/common/entropymv.h index 2db1e385baea..42840d58ad26 100644 --- a/media/libvpx/vp8/common/entropymv.h +++ b/media/libvpx/vp8/common/entropymv.h @@ -9,11 +9,15 @@ */ -#ifndef __INC_ENTROPYMV_H -#define __INC_ENTROPYMV_H +#ifndef VP8_COMMON_ENTROPYMV_H_ +#define VP8_COMMON_ENTROPYMV_H_ #include "treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { mv_max = 1023, /* max absolute value of a MV component */ @@ -41,4 +45,8 @@ typedef struct mv_context extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ENTROPYMV_H_ diff --git a/media/libvpx/vp8/common/extend.h b/media/libvpx/vp8/common/extend.h index 74a0b177d94d..068f4ac5236f 100644 --- a/media/libvpx/vp8/common/extend.h +++ b/media/libvpx/vp8/common/extend.h @@ -9,11 +9,15 @@ */ -#ifndef __INC_EXTEND_H -#define __INC_EXTEND_H +#ifndef VP8_COMMON_EXTEND_H_ +#define VP8_COMMON_EXTEND_H_ #include "vpx_scale/yv12config.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); @@ -22,4 +26,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, int srcy, int srcx, int srch, int srcw); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_EXTEND_H_ diff --git a/media/libvpx/vp8/common/filter.h b/media/libvpx/vp8/common/filter.h index ccda7c8d0202..cfba775fce47 100644 --- 
a/media/libvpx/vp8/common/filter.h +++ b/media/libvpx/vp8/common/filter.h @@ -9,11 +9,15 @@ */ -#ifndef FILTER_H -#define FILTER_H +#ifndef VP8_COMMON_FILTER_H_ +#define VP8_COMMON_FILTER_H_ #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + #define BLOCK_HEIGHT_WIDTH 4 #define VP8_FILTER_WEIGHT 128 #define VP8_FILTER_SHIFT 7 @@ -21,4 +25,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]); extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_FILTER_H_ diff --git a/media/libvpx/vp8/common/findnearmv.h b/media/libvpx/vp8/common/findnearmv.h index c60e463614bc..3c8c0506f675 100644 --- a/media/libvpx/vp8/common/findnearmv.h +++ b/media/libvpx/vp8/common/findnearmv.h @@ -9,14 +9,18 @@ */ -#ifndef __INC_FINDNEARMV_H -#define __INC_FINDNEARMV_H +#ifndef VP8_COMMON_FINDNEARMV_H_ +#define VP8_COMMON_FINDNEARMV_H_ #include "mv.h" #include "blockd.h" #include "modecont.h" #include "treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif + static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) @@ -179,4 +183,8 @@ static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi return (cur_mb->bmi + b - 4)->as_mode; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_FINDNEARMV_H_ diff --git a/media/libvpx/vp8/common/header.h b/media/libvpx/vp8/common/header.h index 3e98eeb3c344..e27bca16bd73 100644 --- a/media/libvpx/vp8/common/header.h +++ b/media/libvpx/vp8/common/header.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_HEADER_H -#define __INC_HEADER_H +#ifndef VP8_COMMON_HEADER_H_ +#define VP8_COMMON_HEADER_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* 24 bits total */ typedef struct @@ -40,4 +44,8 @@ typedef struct #endif +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_HEADER_H_ diff --git a/media/libvpx/vp8/common/invtrans.h b/media/libvpx/vp8/common/invtrans.h index 9262640d5707..affe57e3d6e2 100644 --- a/media/libvpx/vp8/common/invtrans.h +++ b/media/libvpx/vp8/common/invtrans.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_INVTRANS_H -#define __INC_INVTRANS_H +#ifndef VP8_COMMON_INVTRANS_H_ +#define VP8_COMMON_INVTRANS_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -21,6 +21,10 @@ #include "vpx_mem/vpx_mem.h" #endif +#ifdef __cplusplus +extern "C" { +#endif + static void eob_adjust(char *eobs, short *diff) { /* eob adjust.... 
the idct can only skip if both the dc and eob are zero */ @@ -59,4 +63,8 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd) xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_INVTRANS_H_ diff --git a/media/libvpx/vp8/common/loopfilter.c b/media/libvpx/vp8/common/loopfilter.c index 3c0fa636f352..7a07e76fc41c 100644 --- a/media/libvpx/vp8/common/loopfilter.c +++ b/media/libvpx/vp8/common/loopfilter.c @@ -15,6 +15,7 @@ #include "onyxc_int.h" #include "vpx_mem/vpx_mem.h" + static void lf_init_lut(loop_filter_info_n *lfi) { int filt_lvl; diff --git a/media/libvpx/vp8/common/loopfilter.h b/media/libvpx/vp8/common/loopfilter.h index 1e47f3490733..20a6bd375b60 100644 --- a/media/libvpx/vp8/common/loopfilter.h +++ b/media/libvpx/vp8/common/loopfilter.h @@ -9,13 +9,17 @@ */ -#ifndef loopfilter_h -#define loopfilter_h +#ifndef VP8_COMMON_LOOPFILTER_H_ +#define VP8_COMMON_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "vpx_config.h" #include "vp8_rtcd.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_LOOP_FILTER 63 /* fraction of total macroblock rows to be used in fast filter level picking */ /* has to be > 2 */ @@ -102,4 +106,8 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm, int mb_row, int post_ystride, int post_uvstride, unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_LOOPFILTER_H_ diff --git a/media/libvpx/vp8/common/modecont.h b/media/libvpx/vp8/common/modecont.h index 24db88295f92..ff34c33c557c 100644 --- a/media/libvpx/vp8/common/modecont.h +++ b/media/libvpx/vp8/common/modecont.h @@ -9,9 +9,17 @@ */ -#ifndef __INC_MODECONT_H -#define __INC_MODECONT_H +#ifndef VP8_COMMON_MODECONT_H_ +#define VP8_COMMON_MODECONT_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern const int vp8_mode_contexts[6][4]; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_MODECONT_H_ diff --git a/media/libvpx/vp8/common/mv.h b/media/libvpx/vp8/common/mv.h index b3f919db2bd6..111ccd63c722 100644 --- a/media/libvpx/vp8/common/mv.h +++ b/media/libvpx/vp8/common/mv.h @@ -9,10 +9,14 @@ */ -#ifndef __INC_MV_H -#define __INC_MV_H +#ifndef VP8_COMMON_MV_H_ +#define VP8_COMMON_MV_H_ #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { short row; @@ -25,4 +29,8 @@ typedef union int_mv MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_MV_H_ diff --git a/media/libvpx/vp8/common/onyx.h b/media/libvpx/vp8/common/onyx.h index 30c4cbbca63a..d48c4fe5e3d6 100644 --- a/media/libvpx/vp8/common/onyx.h +++ b/media/libvpx/vp8/common/onyx.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8_H -#define __INC_VP8_H +#ifndef VP8_COMMON_ONYX_H_ +#define VP8_COMMON_ONYX_H_ #ifdef __cplusplus extern "C" @@ -39,8 +39,8 @@ extern "C" typedef enum { - USAGE_STREAM_FROM_SERVER = 0x0, - USAGE_LOCAL_FILE_PLAYBACK = 0x1, + USAGE_LOCAL_FILE_PLAYBACK = 0x0, + USAGE_STREAM_FROM_SERVER = 0x1, USAGE_CONSTRAINED_QUALITY = 0x2, USAGE_CONSTANT_QUALITY = 0x3 } END_USAGE; @@ -104,7 +104,18 @@ extern "C" struct vpx_rational timebase; unsigned int target_bandwidth; /* kilobits per second */ - /* parameter used for applying pre processing blur: recommendation 0 */ + /* Parameter used for applying denoiser. 
+ * For temporal denoiser: noise_sensitivity = 0 means off, + * noise_sensitivity = 1 means temporal denoiser on for Y channel only, + * noise_sensitivity = 2 means temporal denoiser on for all channels. + * noise_sensitivity = 3 means aggressive denoising mode. + * noise_sensitivity >= 4 means adaptive denoising mode. + * Temporal denoiser is enabled via the configuration option: + * CONFIG_TEMPORAL_DENOISING. + * For spatial denoiser: noise_sensitivity controls the amount of + * pre-processing blur: noise_sensitivity = 0 means off. + * Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING. + */ int noise_sensitivity; /* parameter used for sharpening output: recommendation 0: */ @@ -213,7 +224,7 @@ extern "C" int arnr_strength; int arnr_type; - struct vpx_fixed_buf two_pass_stats_in; + vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; vp8e_tuning tuning; @@ -267,4 +278,4 @@ extern "C" } #endif -#endif +#endif // VP8_COMMON_ONYX_H_ diff --git a/media/libvpx/vp8/common/onyxc_int.h b/media/libvpx/vp8/common/onyxc_int.h index e9bb7af26b16..6d89865c600a 100644 --- a/media/libvpx/vp8/common/onyxc_int.h +++ b/media/libvpx/vp8/common/onyxc_int.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8C_INT_H -#define __INC_VP8C_INT_H +#ifndef VP8_COMMON_ONYXC_INT_H_ +#define VP8_COMMON_ONYXC_INT_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -26,6 +26,10 @@ #include "header.h" /*#endif*/ +#ifdef __cplusplus +extern "C" { +#endif + #define MINQ 0 #define MAXQ 127 #define QINDEX_RANGE (MAXQ + 1) @@ -174,4 +178,8 @@ typedef struct VP8Common int cpu_caps; } VP8_COMMON; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ONYXC_INT_H_ diff --git a/media/libvpx/vp8/common/onyxd.h b/media/libvpx/vp8/common/onyxd.h index 97c81c130a0a..e37b29f32cff 100644 --- a/media/libvpx/vp8/common/onyxd.h +++ b/media/libvpx/vp8/common/onyxd.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8D_H -#define __INC_VP8D_H +#ifndef VP8_COMMON_ONYXD_H_ +#define VP8_COMMON_ONYXD_H_ /* Create/destroy static data structures. 
*/ @@ -60,4 +60,4 @@ extern "C" #endif -#endif +#endif // VP8_COMMON_ONYXD_H_ diff --git a/media/libvpx/vp8/common/postproc.c b/media/libvpx/vp8/common/postproc.c index dd998f16e8a2..e50d3935f3a1 100644 --- a/media/libvpx/vp8/common/postproc.c +++ b/media/libvpx/vp8/common/postproc.c @@ -71,11 +71,6 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = }; #endif -static const short kernel5[] = -{ - 1, 1, 4, 1, 1 -}; - const short vp8_rv[] = { 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, @@ -308,13 +303,14 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i { d[r&15] = (rv2[r&127] + sum + s[0]) >> 4; } - - s[-8*pitch] = d[(r-8)&15]; + if (r >= 8) + s[-8*pitch] = d[(r-8)&15]; s += pitch; } } } +#if CONFIG_POSTPROC static void vp8_de_mblock(YV12_BUFFER_CONFIG *post, int q) { @@ -387,6 +383,7 @@ void vp8_deblock(VP8_COMMON *cm, vp8_yv12_copy_frame(source, post); } } +#endif #if !(CONFIG_TEMPORAL_DENOISING) void vp8_de_noise(VP8_COMMON *cm, @@ -396,12 +393,12 @@ void vp8_de_noise(VP8_COMMON *cm, int low_var_thresh, int flag) { + int mbr; double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); - int mb_rows = source->y_width >> 4; - int mb_cols = source->y_height >> 4; + int mb_rows = cm->mb_rows; + int mb_cols = cm->mb_cols; unsigned char *limits = cm->pp_limits_buffer;; - int mbr, mbc; (void) post; (void) low_var_thresh; (void) flag; diff --git a/media/libvpx/vp8/common/postproc.h b/media/libvpx/vp8/common/postproc.h index 495a2c906f62..33d0a7f025be 100644 --- a/media/libvpx/vp8/common/postproc.h +++ b/media/libvpx/vp8/common/postproc.h @@ -9,8 +9,8 @@ */ -#ifndef POSTPROC_H -#define POSTPROC_H +#ifndef VP8_COMMON_POSTPROC_H_ +#define VP8_COMMON_POSTPROC_H_ #include "vpx_ports/mem.h" struct postproc_state @@ -26,6 +26,10 @@ struct postproc_state }; #include "onyxc_int.h" #include "ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); @@ -47,4 +51,8 @@ void vp8_deblock(struct VP8Common *oci, #define MFQE_PRECISION 4 void vp8_multiframe_quality_enhance(struct VP8Common *cm); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_POSTPROC_H_ diff --git a/media/libvpx/vp8/common/ppflags.h b/media/libvpx/vp8/common/ppflags.h index 665e21fd9654..768224aad5e6 100644 --- a/media/libvpx/vp8/common/ppflags.h +++ b/media/libvpx/vp8/common/ppflags.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_PPFLAGS_H -#define __INC_PPFLAGS_H +#ifndef VP8_COMMON_PPFLAGS_H_ +#define VP8_COMMON_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif enum { VP8D_NOFILTERING = 0, @@ -38,4 +42,8 @@ typedef struct int display_mv_flag; } vp8_ppflags_t; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_PPFLAGS_H_ diff --git a/media/libvpx/vp8/common/pragmas.h b/media/libvpx/vp8/common/pragmas.h deleted file mode 100644 index 99fee5ae2327..000000000000 --- a/media/libvpx/vp8/common/pragmas.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - - - -#ifdef __INTEL_COMPILER -#pragma warning(disable:997 1011 170) -#endif -#ifdef _MSC_VER -#pragma warning(disable:4799) -#endif diff --git a/media/libvpx/vp8/common/quant_common.h b/media/libvpx/vp8/common/quant_common.h index cb64d8eb8d87..700b5e6d726c 100644 --- a/media/libvpx/vp8/common/quant_common.h +++ b/media/libvpx/vp8/common/quant_common.h @@ -8,14 +8,27 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_QUANT_COMMON_H_ +#define VP8_COMMON_QUANT_COMMON_H_ + #include "string.h" #include "blockd.h" #include "onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern int vp8_ac_yquant(int QIndex); extern int vp8_dc_quant(int QIndex, int Delta); extern int vp8_dc2quant(int QIndex, int Delta); extern int vp8_ac2quant(int QIndex, int Delta); extern int vp8_dc_uv_quant(int QIndex, int Delta); extern int vp8_ac_uv_quant(int QIndex, int Delta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_QUANT_COMMON_H_ diff --git a/media/libvpx/vp8/common/reconinter.h b/media/libvpx/vp8/common/reconinter.h index 233c02e5ba5f..ba979b9664af 100644 --- a/media/libvpx/vp8/common/reconinter.h +++ b/media/libvpx/vp8/common/reconinter.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_RECONINTER_H -#define __INC_RECONINTER_H +#ifndef VP8_COMMON_RECONINTER_H_ +#define VP8_COMMON_RECONINTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, @@ -32,4 +36,8 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_RECONINTER_H_ diff --git a/media/libvpx/vp8/common/reconintra4x4.h b/media/libvpx/vp8/common/reconintra4x4.h index d2b0d43461eb..ed59c9edd4cd 100644 --- a/media/libvpx/vp8/common/reconintra4x4.h +++ b/media/libvpx/vp8/common/reconintra4x4.h @@ -9,10 +9,14 @@ */ -#ifndef __INC_RECONINTRA4x4_H -#define __INC_RECONINTRA4x4_H +#ifndef VP8_COMMON_RECONINTRA4X4_H_ +#define VP8_COMMON_RECONINTRA4X4_H_ #include "vp8/common/blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + static void intra_prediction_down_copy(MACROBLOCKD *xd, unsigned char *above_right_src) { @@ -29,4 +33,8 @@ static void intra_prediction_down_copy(MACROBLOCKD *xd, *dst_ptr2 = *src_ptr; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_RECONINTRA4X4_H_ diff --git a/media/libvpx/vp8/common/setupintrarecon.h b/media/libvpx/vp8/common/setupintrarecon.h index 8b6c50bd3c78..608f4a9ac255 100644 --- a/media/libvpx/vp8/common/setupintrarecon.h +++ b/media/libvpx/vp8/common/setupintrarecon.h @@ -8,10 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef SETUPINTRARECON_H -#define SETUPINTRARECON_H +#ifndef VP8_COMMON_SETUPINTRARECON_H_ +#define VP8_COMMON_SETUPINTRARECON_H_ #include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf); @@ -34,4 +38,8 @@ void setup_intra_recon_left(unsigned char *y_buffer, v_buffer[uv_stride *i] = (unsigned char) 129; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_SETUPINTRARECON_H_ diff --git a/media/libvpx/vp8/common/swapyv12buffer.h b/media/libvpx/vp8/common/swapyv12buffer.h index a6473ed92be7..1d66cd3d62e6 100644 --- a/media/libvpx/vp8/common/swapyv12buffer.h +++ b/media/libvpx/vp8/common/swapyv12buffer.h @@ -9,11 +9,19 @@ */ -#ifndef SWAPYV12_BUFFER_H -#define SWAPYV12_BUFFER_H +#ifndef VP8_COMMON_SWAPYV12BUFFER_H_ +#define VP8_COMMON_SWAPYV12BUFFER_H_ #include "vpx_scale/yv12config.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/media/libvpx/vp8/common/systemdependent.h b/media/libvpx/vp8/common/systemdependent.h index e6b0456f75bc..3d44e37cf249 100644 --- a/media/libvpx/vp8/common/systemdependent.h +++ b/media/libvpx/vp8/common/systemdependent.h @@ -8,8 +8,20 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_ +#define VP8_COMMON_SYSTEMDEPENDENT_H_ #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + struct VP8Common; void vp8_machine_specific_config(struct VP8Common *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SYSTEMDEPENDENT_H_ diff --git a/media/libvpx/vp8/common/threading.h b/media/libvpx/vp8/common/threading.h index ed9e3e60dcb8..01c82dbb805b 100644 --- a/media/libvpx/vp8/common/threading.h +++ b/media/libvpx/vp8/common/threading.h @@ -9,8 +9,12 @@ */ -#ifndef _PTHREAD_EMULATION -#define _PTHREAD_EMULATION +#ifndef VP8_COMMON_THREADING_H_ +#define VP8_COMMON_THREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD @@ -183,4 +187,8 @@ static inline int sem_destroy(sem_t * sem) #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_THREADING_H_ diff --git a/media/libvpx/vp8/common/treecoder.h b/media/libvpx/vp8/common/treecoder.h index ebf51c5ed657..d22b7c570cbe 100644 --- a/media/libvpx/vp8/common/treecoder.h +++ b/media/libvpx/vp8/common/treecoder.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_TREECODER_H -#define __INC_TREECODER_H +#ifndef VP8_COMMON_TREECODER_H_ +#define VP8_COMMON_TREECODER_H_ + +#ifdef __cplusplus +extern "C" { +#endif typedef unsigned char vp8bc_index_t; /* probability index */ @@ -87,4 +91,8 @@ void vp8bc_tree_probs_from_distribution( ); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_TREECODER_H_ diff --git a/media/libvpx/vp8/common/variance.h b/media/libvpx/vp8/common/variance.h index 01193b8d7243..89a32a72268c 100644 --- a/media/libvpx/vp8/common/variance.h +++ b/media/libvpx/vp8/common/variance.h @@ -9,11 +9,15 @@ */ -#ifndef VARIANCE_H -#define VARIANCE_H +#ifndef VP8_COMMON_VARIANCE_H_ +#define VP8_COMMON_VARIANCE_H_ #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef unsigned int(*vp8_sad_fn_t)( const unsigned char *src_ptr, int source_stride, @@ -112,4 +116,8 @@ 
typedef struct variance_vtable #endif } vp8_variance_fn_ptr_t; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_VARIANCE_H_ diff --git a/media/libvpx/vp8/common/vp8_entropymodedata.h b/media/libvpx/vp8/common/vp8_entropymodedata.h index 13e9a92fc106..c4aed4989705 100644 --- a/media/libvpx/vp8/common/vp8_entropymodedata.h +++ b/media/libvpx/vp8/common/vp8_entropymodedata.h @@ -8,6 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif /*Generated file, included by entropymode.c*/ @@ -240,3 +246,9 @@ const vp8_prob vp8_kf_bmode_prob { 112, 19, 12, 61, 195, 128, 48, 4, 24 } } }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/media/libvpx/vp8/common/x86/filter_x86.h b/media/libvpx/vp8/common/x86/filter_x86.h index cfadaeecbc12..d282841bee4d 100644 --- a/media/libvpx/vp8/common/x86/filter_x86.h +++ b/media/libvpx/vp8/common/x86/filter_x86.h @@ -8,11 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef FILTER_X86_H -#define FILTER_X86_H +#ifndef VP8_COMMON_X86_FILTER_X86_H_ +#define VP8_COMMON_X86_FILTER_X86_H_ #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + /* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with * duplicated values */ @@ -22,4 +26,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); /* duplicated 8x */ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); -#endif /* FILTER_X86_H */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/media/libvpx/vp8/common/x86/loopfilter_block_sse2.asm b/media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm similarity index 100% rename from media/libvpx/vp8/common/x86/loopfilter_block_sse2.asm rename to media/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm diff --git a/media/libvpx/vp8/common/x86/loopfilter_mmx.asm b/media/libvpx/vp8/common/x86/loopfilter_mmx.asm index f388d2476818..88a07b9f3fa7 100644 --- a/media/libvpx/vp8/common/x86/loopfilter_mmx.asm +++ b/media/libvpx/vp8/common/x86/loopfilter_mmx.asm @@ -527,7 +527,7 @@ sym(vp8_loop_filter_vertical_edge_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 @@ -1289,7 +1289,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx): pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 - ; tranpose and write back + ; transpose and write back movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 diff --git a/media/libvpx/vp8/common/x86/loopfilter_sse2.asm b/media/libvpx/vp8/common/x86/loopfilter_sse2.asm index a66753ba806e..1913abc69b07 100644 --- a/media/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/media/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -958,7 +958,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ; start work on filters B_FILTER 2 - ; tranpose and write back - only work on q1, q0, p0, p1 + ; transpose and write back - only work on q1, q0, p0, p1 BV_TRANSPOSE ; store 16-line result @@ -1023,7 +1023,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ; start work on filters B_FILTER 2 - ; tranpose and write back - only work on q1, q0, p0, p1 
+ ; transpose and write back - only work on q1, q0, p0, p1 BV_TRANSPOSE lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing diff --git a/media/libvpx/vp8/common/x86/postproc_mmx.asm b/media/libvpx/vp8/common/x86/postproc_mmx.asm index 5cf110b53264..a2b16327f08a 100644 --- a/media/libvpx/vp8/common/x86/postproc_mmx.asm +++ b/media/libvpx/vp8/common/x86/postproc_mmx.asm @@ -204,13 +204,16 @@ sym(vp8_mbpost_proc_down_mmx): and rcx, 15 movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] + cmp edx, 8 + jl .skip_assignment + mov rcx, rdx sub rcx, 8 - and rcx, 15 movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] - movd [rsi], mm1 + +.skip_assignment lea rsi, [rsi+rax] lea rdi, [rdi+rax] @@ -243,7 +246,6 @@ sym(vp8_mbpost_proc_down_mmx): ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], ; unsigned int Width, unsigned int Height, int Pitch) -extern sym(rand) global sym(vp8_plane_add_noise_mmx) PRIVATE sym(vp8_plane_add_noise_mmx): push rbp @@ -255,7 +257,7 @@ sym(vp8_plane_add_noise_mmx): ; end prolog .addnoise_loop: - call sym(rand) WRT_PLT + call sym(LIBVPX_RAND) WRT_PLT mov rcx, arg(1) ;noise and rax, 0xff add rcx, rax diff --git a/media/libvpx/vp8/common/x86/postproc_sse2.asm b/media/libvpx/vp8/common/x86/postproc_sse2.asm index 00f84a31b21d..fed4ee5ccf80 100644 --- a/media/libvpx/vp8/common/x86/postproc_sse2.asm +++ b/media/libvpx/vp8/common/x86/postproc_sse2.asm @@ -425,13 +425,16 @@ sym(vp8_mbpost_proc_down_xmm): and rcx, 15 movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] + cmp edx, 8 + jl .skip_assignment + mov rcx, rdx sub rcx, 8 - and rcx, 15 movq mm0, [rsp + rcx*8] ;d[rcx*8] - movq [rsi], mm0 + +.skip_assignment lea rsi, [rsi+rax] lea rdi, [rdi+rax] @@ -657,7 +660,6 @@ sym(vp8_mbpost_proc_across_ip_xmm): ; unsigned char whiteclamp[16], ; unsigned char bothclamp[16], ; unsigned int Width, unsigned int Height, int Pitch) -extern sym(rand) global sym(vp8_plane_add_noise_wmt) PRIVATE sym(vp8_plane_add_noise_wmt): push rbp @@ -669,7 +671,7 @@ sym(vp8_plane_add_noise_wmt): ; end prolog .addnoise_loop: - call sym(rand) WRT_PLT + call sym(LIBVPX_RAND) WRT_PLT mov rcx, arg(1) ;noise and rax, 0xff add rcx, rax diff --git a/media/libvpx/vp8/common/x86/postproc_x86.c b/media/libvpx/vp8/common/x86/postproc_x86.c deleted file mode 100644 index 3ec0106a8ac5..000000000000 --- a/media/libvpx/vp8/common/x86/postproc_x86.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -/* On Android NDK, rand is inlined function, but postproc needs rand symbol */ -#if defined(__ANDROID__) -#define rand __rand -#include -#undef rand - -extern int rand(void) -{ - return __rand(); -} -#else -/* ISO C forbids an empty translation unit. 
*/ -int vp8_unused; -#endif diff --git a/media/libvpx/vp8/common/x86/recon_sse2.asm b/media/libvpx/vp8/common/x86/recon_sse2.asm index 1434bcd9379d..7141f8324632 100644 --- a/media/libvpx/vp8/common/x86/recon_sse2.asm +++ b/media/libvpx/vp8/common/x86/recon_sse2.asm @@ -365,6 +365,7 @@ sym(vp8_intra_pred_uv_tm_%1): GET_GOT rbx push rsi push rdi + push rbx ; end prolog ; read top row @@ -395,8 +396,11 @@ sym(vp8_intra_pred_uv_tm_%1): movsxd rcx, dword ptr arg(1) ;dst_stride .vp8_intra_pred_uv_tm_%1_loop: - movd xmm3, [rsi] - movd xmm5, [rsi+rax] + mov bl, [rsi] + movd xmm3, ebx + + mov bl, [rsi+rax] + movd xmm5, ebx %ifidn %1, sse2 punpcklbw xmm3, xmm0 punpcklbw xmm5, xmm0 @@ -419,6 +423,7 @@ sym(vp8_intra_pred_uv_tm_%1): jnz .vp8_intra_pred_uv_tm_%1_loop ; begin epilog + pop rbx pop rdi pop rsi RESTORE_GOT @@ -486,10 +491,8 @@ sym(vp8_intra_pred_uv_ho_%1): SHADOW_ARGS_TO_STACK 5 push rsi push rdi -%ifidn %1, ssse3 -%ifndef GET_GOT_SAVE_ARG push rbx -%endif +%ifidn %1, ssse3 GET_GOT rbx %endif ; end prolog @@ -507,13 +510,16 @@ sym(vp8_intra_pred_uv_ho_%1): %ifidn %1, ssse3 lea rdx, [rcx*3] movdqa xmm2, [GLOBAL(dc_00001111)] - lea rbx, [rax*3] %endif %ifidn %1, mmx2 .vp8_intra_pred_uv_ho_%1_loop: - movd mm0, [rsi] - movd mm1, [rsi+rax] + mov bl, [rsi] + movd mm0, ebx + + mov bl, [rsi+rax] + movd mm1, ebx + punpcklbw mm0, mm0 punpcklbw mm1, mm1 pshufw mm0, mm0, 0x0 @@ -525,10 +531,19 @@ sym(vp8_intra_pred_uv_ho_%1): dec edx jnz .vp8_intra_pred_uv_ho_%1_loop %else - movd xmm0, [rsi] - movd xmm3, [rsi+rax] - movd xmm1, [rsi+rax*2] - movd xmm4, [rsi+rbx] + mov bl, [rsi] + movd xmm0, ebx + + mov bl, [rsi+rax] + movd xmm3, ebx + + mov bl, [rsi+rax*2] + movd xmm1, ebx + + lea rbx, [rax*3] + mov bl, [rsi+rbx] + movd xmm4, ebx + punpcklbw xmm0, xmm3 punpcklbw xmm1, xmm4 pshufb xmm0, xmm2 @@ -539,10 +554,20 @@ sym(vp8_intra_pred_uv_ho_%1): movhps [rdi+rdx], xmm1 lea rsi, [rsi+rax*4] lea rdi, [rdi+rcx*4] - movd xmm0, [rsi] - movd xmm3, [rsi+rax] - movd xmm1, [rsi+rax*2] - movd xmm4, [rsi+rbx] + + mov bl, [rsi] + movd xmm0, ebx + + mov bl, [rsi+rax] + movd xmm3, ebx + + mov bl, [rsi+rax*2] + movd xmm1, ebx + + lea rbx, [rax*3] + mov bl, [rsi+rbx] + movd xmm4, ebx + punpcklbw xmm0, xmm3 punpcklbw xmm1, xmm4 pshufb xmm0, xmm2 @@ -556,10 +581,8 @@ sym(vp8_intra_pred_uv_ho_%1): ; begin epilog %ifidn %1, ssse3 RESTORE_GOT -%ifndef GET_GOT_SAVE_ARG +%endif pop rbx -%endif -%endif pop rdi pop rsi UNSHADOW_ARGS @@ -893,6 +916,7 @@ sym(vp8_intra_pred_y_tm_%1): SAVE_XMM 7 push rsi push rdi + push rbx GET_GOT rbx ; end prolog @@ -926,8 +950,11 @@ sym(vp8_intra_pred_y_tm_%1): mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride vp8_intra_pred_y_tm_%1_loop: - movd xmm4, [rsi] - movd xmm5, [rsi+rax] + mov bl, [rsi] + movd xmm4, ebx + + mov bl, [rsi+rax] + movd xmm5, ebx %ifidn %1, sse2 punpcklbw xmm4, xmm0 punpcklbw xmm5, xmm0 @@ -956,6 +983,7 @@ vp8_intra_pred_y_tm_%1_loop: ; begin epilog RESTORE_GOT + pop rbx pop rdi pop rsi RESTORE_XMM @@ -1029,6 +1057,7 @@ sym(vp8_intra_pred_y_ho_sse2): SHADOW_ARGS_TO_STACK 5 push rsi push rdi + push rbx ; end prolog ;arg(2) not used @@ -1041,8 +1070,11 @@ sym(vp8_intra_pred_y_ho_sse2): movsxd rcx, dword ptr arg(1) ;dst_stride vp8_intra_pred_y_ho_sse2_loop: - movd xmm0, [rsi] - movd xmm1, [rsi+rax] + mov bl, [rsi] + movd xmm0, ebx + mov bl, [rsi+rax] + movd xmm1, ebx + ; FIXME use pshufb for ssse3 version punpcklbw xmm0, xmm0 punpcklbw xmm1, xmm1 @@ -1058,6 +1090,7 @@ vp8_intra_pred_y_ho_sse2_loop: jnz vp8_intra_pred_y_ho_sse2_loop ; begin epilog + pop rbx pop rdi pop 
rsi UNSHADOW_ARGS diff --git a/media/libvpx/vp8/common/x86/variance_impl_mmx.asm b/media/libvpx/vp8/common/x86/variance_impl_mmx.asm index d9120d0d41ec..7d5e6810bf0d 100644 --- a/media/libvpx/vp8/common/x86/variance_impl_mmx.asm +++ b/media/libvpx/vp8/common/x86/variance_impl_mmx.asm @@ -342,8 +342,8 @@ sym(vp8_get4x4var_mmx): movsxd rdx, dword ptr arg(3) ;[recon_stride] ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm0, [rax] ; Copy four bytes to mm0 + movd mm1, [rbx] ; Copy four bytes to mm1 punpcklbw mm0, mm6 ; unpack to higher prrcision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -351,12 +351,12 @@ sym(vp8_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy four bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy four bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher prrcision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 @@ -365,12 +365,12 @@ sym(vp8_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy four bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision + movd mm0, [rax] ; Copy four bytes to mm0 + punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 paddw mm5, mm0 ; accumulate differences in mm5 @@ -378,11 +378,11 @@ sym(vp8_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate add rbx,rdx ; Inc pointer into ref data add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 + movd mm1, [rbx] ; Copy four bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 + movd mm0, [rax] ; Copy four bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher prrcision punpcklbw mm1, mm6 diff --git a/media/libvpx/vp8/common/x86/variance_mmx.c b/media/libvpx/vp8/common/x86/variance_mmx.c index 36995db9aa7b..02e02420f464 100644 --- a/media/libvpx/vp8/common/x86/variance_mmx.c +++ b/media/libvpx/vp8/common/x86/variance_mmx.c @@ -10,7 +10,6 @@ #include "vpx_config.h" #include "vp8/common/variance.h" -#include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" #include "vp8/common/x86/filter_x86.h" diff --git a/media/libvpx/vp8/common/x86/variance_sse2.c b/media/libvpx/vp8/common/x86/variance_sse2.c index 7fa5f53dcbf6..1fe127bf2c6a 100644 --- a/media/libvpx/vp8/common/x86/variance_sse2.c +++ b/media/libvpx/vp8/common/x86/variance_sse2.c @@ -10,7 +10,6 @@ #include "vpx_config.h" #include "vp8/common/variance.h" -#include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" #include "vp8/common/x86/filter_x86.h" diff --git a/media/libvpx/vp8/common/x86/variance_ssse3.c b/media/libvpx/vp8/common/x86/variance_ssse3.c index f90f8117c4db..73eb90df61f6 100644 --- a/media/libvpx/vp8/common/x86/variance_ssse3.c +++ b/media/libvpx/vp8/common/x86/variance_ssse3.c @@ -10,7 +10,6 @@ #include "vpx_config.h" #include "vp8/common/variance.h" -#include "vp8/common/pragmas.h" #include "vpx_ports/mem.h" extern unsigned int vp8_get16x16var_sse2 diff --git a/media/libvpx/vp8/decoder/dboolhuff.c b/media/libvpx/vp8/decoder/dboolhuff.c 
index 0007d7a7a3e3..b874d4c46c07 100644 --- a/media/libvpx/vp8/decoder/dboolhuff.c +++ b/media/libvpx/vp8/decoder/dboolhuff.c @@ -10,11 +10,12 @@ #include "dboolhuff.h" +#include "vp8/common/common.h" int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, unsigned int source_sz, - vp8_decrypt_cb *decrypt_cb, + vpx_decrypt_cb decrypt_cb, void *decrypt_state) { br->user_buffer_end = source+source_sz; @@ -39,7 +40,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br) const unsigned char *bufptr = br->user_buffer; VP8_BD_VALUE value = br->value; int count = br->count; - int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8); + int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); size_t bytes_left = br->user_buffer_end - bufptr; size_t bits_left = bytes_left * CHAR_BIT; int x = (int)(shift + CHAR_BIT - bits_left); @@ -47,7 +48,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br) unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1]; if (br->decrypt_cb) { - size_t n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left; + size_t n = MIN(sizeof(decrypted), bytes_left); br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n); bufptr = decrypted; } diff --git a/media/libvpx/vp8/decoder/dboolhuff.h b/media/libvpx/vp8/decoder/dboolhuff.h index 4c0ca1ce7376..51c5adc28abe 100644 --- a/media/libvpx/vp8/decoder/dboolhuff.h +++ b/media/libvpx/vp8/decoder/dboolhuff.h @@ -9,16 +9,21 @@ */ -#ifndef DBOOLHUFF_H_ -#define DBOOLHUFF_H_ +#ifndef VP8_DECODER_DBOOLHUFF_H_ +#define VP8_DECODER_DBOOLHUFF_H_ #include <stddef.h> #include <limits.h> #include "vpx_config.h" #include "vpx_ports/mem.h" +#include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef size_t VP8_BD_VALUE; #define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT) @@ -28,12 +33,6 @@ typedef size_t VP8_BD_VALUE; Even relatively modest values like 100 would work fine.*/ #define VP8_LOTS_OF_BITS (0x40000000) -/*Decrypt n bytes of data from input -> output, using the decrypt_state passed in VP8D_SET_DECRYPTOR.
-*/ -typedef void (vp8_decrypt_cb)(void *decrypt_state, const unsigned char *input, - unsigned char *output, int count); - typedef struct { const unsigned char *user_buffer_end; @@ -41,7 +40,7 @@ typedef struct VP8_BD_VALUE value; int count; unsigned int range; - vp8_decrypt_cb *decrypt_cb; + vpx_decrypt_cb decrypt_cb; void *decrypt_state; } BOOL_DECODER; @@ -50,7 +49,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source, unsigned int source_sz, - vp8_decrypt_cb *decrypt_cb, + vpx_decrypt_cb decrypt_cb, void *decrypt_state); void vp8dx_bool_decoder_fill(BOOL_DECODER *br); @@ -135,4 +134,8 @@ static int vp8dx_bool_error(BOOL_DECODER *br) return 0; } -#endif // DBOOLHUFF_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DBOOLHUFF_H_ diff --git a/media/libvpx/vp8/decoder/decodframe.c b/media/libvpx/vp8/decoder/decodeframe.c similarity index 98% rename from media/libvpx/vp8/decoder/decodframe.c rename to media/libvpx/vp8/decoder/decodeframe.c index 16da78a2c1ee..e7cf0d9b9c61 100644 --- a/media/libvpx/vp8/decoder/decodframe.c +++ b/media/libvpx/vp8/decoder/decodeframe.c @@ -17,6 +17,7 @@ #include "vp8/common/reconintra4x4.h" #include "vp8/common/reconinter.h" #include "detokenize.h" +#include "vp8/common/common.h" #include "vp8/common/invtrans.h" #include "vp8/common/alloccommon.h" #include "vp8/common/entropymode.h" @@ -631,9 +632,17 @@ static void decode_mb_rows(VP8D_COMP *pbi) xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; - xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset; - xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset; - xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset; + if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) { + MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame; + xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset; + } else { + // ref_frame is INTRA_FRAME, pre buffer should not be used. 
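+ // (Illustrative rationale, inferred rather than stated by the patch:
+ // intra macroblocks predict only from already-reconstructed pixels of
+ // the current frame, so xd->pre is never legitimately read on this
+ // path; nulling the pointers makes any accidental inter-prediction
+ // read fail fast instead of silently using a stale reference frame.)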
+ xd->pre.y_buffer = 0; + xd->pre.u_buffer = 0; + xd->pre.v_buffer = 0; + } /* propagate errors from reference frames */ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; @@ -680,7 +689,6 @@ static void decode_mb_rows(VP8D_COMP *pbi) vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride, recon_uv_stride, lf_dst[0], lf_dst[1], lf_dst[2]); - if(mb_row > 1) { yv12_extend_frame_left_right_c(yv12_fb_new, @@ -691,10 +699,6 @@ static void decode_mb_rows(VP8D_COMP *pbi) eb_dst[0] += recon_y_stride * 16; eb_dst[1] += recon_uv_stride * 8; eb_dst[2] += recon_uv_stride * 8; - - if(mb_row == 2) - yv12_extend_frame_top_c(yv12_fb_new); - } lf_dst[0] += recon_y_stride * 16; @@ -713,13 +717,9 @@ static void decode_mb_rows(VP8D_COMP *pbi) eb_dst[0], eb_dst[1], eb_dst[2]); - eb_dst[0] += recon_y_stride * 16; eb_dst[1] += recon_uv_stride * 8; eb_dst[2] += recon_uv_stride * 8; - - if(mb_row == 1) - yv12_extend_frame_top_c(yv12_fb_new); } } } @@ -747,7 +747,7 @@ static void decode_mb_rows(VP8D_COMP *pbi) eb_dst[0], eb_dst[1], eb_dst[2]); - + yv12_extend_frame_top_c(yv12_fb_new); yv12_extend_frame_bottom_c(yv12_fb_new); } @@ -1019,8 +1019,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) const unsigned char *clear = data; if (pbi->decrypt_cb) { - int n = (int)(data_end - data); - if (n > 10) n = 10; + int n = (int)MIN(sizeof(clear_buffer), data_end - data); pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n); clear = clear_buffer; } diff --git a/media/libvpx/vp8/decoder/decodemv.h b/media/libvpx/vp8/decoder/decodemv.h index 05a33d27f6a3..f33b07351d3c 100644 --- a/media/libvpx/vp8/decoder/decodemv.h +++ b/media/libvpx/vp8/decoder/decodemv.h @@ -8,11 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef DECODEMV_H_ -#define DECODEMV_H_ +#ifndef VP8_DECODER_DECODEMV_H_ +#define VP8_DECODER_DECODEMV_H_ #include "onyxd_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_decode_mode_mvs(VP8D_COMP *); -#endif // DECODEMV_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DECODEMV_H_ diff --git a/media/libvpx/vp8/decoder/decoderthreading.h b/media/libvpx/vp8/decoder/decoderthreading.h index bc716e489e08..c563cf6e93aa 100644 --- a/media/libvpx/vp8/decoder/decoderthreading.h +++ b/media/libvpx/vp8/decoder/decoderthreading.h @@ -8,8 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef DECODERTHREADING_H_ -#define DECODERTHREADING_H_ +#ifndef VP8_DECODER_DECODERTHREADING_H_ +#define VP8_DECODER_DECODERTHREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif #if CONFIG_MULTITHREAD void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); @@ -19,4 +23,8 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); #endif -#endif // DECODERTHREADING_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DECODERTHREADING_H_ diff --git a/media/libvpx/vp8/decoder/detokenize.h b/media/libvpx/vp8/decoder/detokenize.h index f2130b361789..f0b125444f03 100644 --- a/media/libvpx/vp8/decoder/detokenize.h +++ b/media/libvpx/vp8/decoder/detokenize.h @@ -8,12 +8,20 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef DETOKENIZE_H_ -#define DETOKENIZE_H_ +#ifndef VP8_DECODER_DETOKENIZE_H_ +#define VP8_DECODER_DETOKENIZE_H_ #include "onyxd_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_reset_mb_tokens_context(MACROBLOCKD *x); int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); -#endif // DETOKENIZE_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DETOKENIZE_H_ diff --git a/media/libvpx/vp8/decoder/ec_types.h b/media/libvpx/vp8/decoder/ec_types.h index b24bfd9439f7..3af5ca86b4b9 100644 --- a/media/libvpx/vp8/decoder/ec_types.h +++ b/media/libvpx/vp8/decoder/ec_types.h @@ -8,8 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DEC_EC_TYPES_H -#define VP8_DEC_EC_TYPES_H +#ifndef VP8_DECODER_EC_TYPES_H_ +#define VP8_DECODER_EC_TYPES_H_ + +#ifdef __cplusplus +extern "C" { +#endif #define MAX_OVERLAPS 16 @@ -47,4 +51,8 @@ typedef struct MV_REFERENCE_FRAME ref_frame; } EC_BLOCK; -#endif // VP8_DEC_EC_TYPES_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_EC_TYPES_H_ diff --git a/media/libvpx/vp8/decoder/error_concealment.c b/media/libvpx/vp8/decoder/error_concealment.c index 0b58c98fd88a..4b304c83c78c 100644 --- a/media/libvpx/vp8/decoder/error_concealment.c +++ b/media/libvpx/vp8/decoder/error_concealment.c @@ -15,9 +15,7 @@ #include "decodemv.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/findnearmv.h" - -#define MIN(x,y) (((x)<(y))?(x):(y)) -#define MAX(x,y) (((x)>(y))?(x):(y)) +#include "vp8/common/common.h" #define FLOOR(x,q) ((x) & -(1 << (q))) diff --git a/media/libvpx/vp8/decoder/error_concealment.h b/media/libvpx/vp8/decoder/error_concealment.h index fb96b3605ec8..9a1e024865d9 100644 --- a/media/libvpx/vp8/decoder/error_concealment.h +++ b/media/libvpx/vp8/decoder/error_concealment.h @@ -9,12 +9,16 @@ */ -#ifndef ERROR_CONCEALMENT_H_ -#define ERROR_CONCEALMENT_H_ +#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_ +#define VP8_DECODER_ERROR_CONCEALMENT_H_ #include "onyxd_int.h" #include "ec_types.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Allocate memory for the overlap lists */ int vp8_alloc_overlap_lists(VP8D_COMP *pbi); @@ -38,4 +42,8 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, */ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd); -#endif // ERROR_CONCEALMENT_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_ERROR_CONCEALMENT_H_ diff --git a/media/libvpx/vp8/decoder/onyxd_if.c b/media/libvpx/vp8/decoder/onyxd_if.c index 2d9e343bc9f0..1d763b6bfae2 100644 --- a/media/libvpx/vp8/decoder/onyxd_if.c +++ b/media/libvpx/vp8/decoder/onyxd_if.c @@ -178,12 +178,6 @@ vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_ return pbi->common.error.error_code; } -/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/ -#if HAVE_NEON -extern void vp8_push_neon(int64_t *store); -extern void vp8_pop_neon(int64_t *store); -#endif - static int get_free_fb (VP8_COMMON *cm) { int i; @@ -307,9 +301,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, const uint8_t *source, int64_t time_stamp) { -#if HAVE_NEON - int64_t dx_store_reg[8]; -#endif VP8_COMMON *cm = &pbi->common; int retcode = -1; @@ -319,15 +310,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, if(retcode <= 0) return retcode; -#if HAVE_NEON -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->cpu_caps & HAS_NEON) -#endif - { - vp8_push_neon(dx_store_reg); - } -#endif - cm->new_fb_idx = get_free_fb (cm); /* setup reference frames for vp8_decode_frame */ @@ 
-403,16 +385,8 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, pbi->last_time_stamp = time_stamp; decode_exit: -#if HAVE_NEON -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->cpu_caps & HAS_NEON) -#endif - { - vp8_pop_neon(dx_store_reg); - } -#endif - pbi->common.error.setjmp = 0; + vp8_clear_system_state(); return retcode; } int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags) diff --git a/media/libvpx/vp8/decoder/onyxd_int.h b/media/libvpx/vp8/decoder/onyxd_int.h index 54a98f7cc321..aa2cc57f7b18 100644 --- a/media/libvpx/vp8/decoder/onyxd_int.h +++ b/media/libvpx/vp8/decoder/onyxd_int.h @@ -9,8 +9,8 @@ */ -#ifndef ONYXD_INT_H_ -#define ONYXD_INT_H_ +#ifndef VP8_DECODER_ONYXD_INT_H_ +#define VP8_DECODER_ONYXD_INT_H_ #include "vpx_config.h" #include "vp8/common/onyxd.h" @@ -22,6 +22,10 @@ #include "ec_types.h" #endif +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { int ithread; @@ -122,7 +126,7 @@ typedef struct VP8D_COMP int independent_partitions; int frame_corrupt_residual; - vp8_decrypt_cb *decrypt_cb; + vpx_decrypt_cb decrypt_cb; void *decrypt_state; } VP8D_COMP; @@ -148,4 +152,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); } while(0) #endif -#endif // ONYXD_INT_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_ONYXD_INT_H_ diff --git a/media/libvpx/vp8/decoder/treereader.h b/media/libvpx/vp8/decoder/treereader.h index 9393bb478570..35ee696000d9 100644 --- a/media/libvpx/vp8/decoder/treereader.h +++ b/media/libvpx/vp8/decoder/treereader.h @@ -9,12 +9,16 @@ */ -#ifndef TREEREADER_H_ -#define TREEREADER_H_ +#ifndef VP8_DECODER_TREEREADER_H_ +#define VP8_DECODER_TREEREADER_H_ #include "vp8/common/treecoder.h" #include "dboolhuff.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef BOOL_DECODER vp8_reader; #define vp8_read vp8dx_decode_bool @@ -37,4 +41,8 @@ static int vp8_treed_read( return -i; } -#endif // TREEREADER_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_TREEREADER_H_ diff --git a/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c new file mode 100644 index 000000000000..08be76e4338f --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/denoising_neon.c @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "vp8/encoder/denoising.h" +#include "vpx_mem/vpx_mem.h" +#include "./vp8_rtcd.h" + +/* + * The filter function was modified to reduce the computational complexity. + * + * Step 1: + * Instead of applying tap coefficients for each pixel, we calculated the + * pixel adjustments vs. pixel diff value ahead of time. + * adjustment = filtered_value - current_raw + * = (filter_coefficient * diff + 128) >> 8 + * where + * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3)); + * filter_coefficient += filter_coefficient / + * (3 + motion_magnitude_adjustment); + * filter_coefficient is clamped to 0 ~ 255. + * + * Step 2: + * The adjustment vs. diff curve becomes flat very quickly when diff increases.
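+ * (Worked example, assuming motion_magnitude_adjustment == 0: for diff = 4,
+ * filter_coefficient = (255 << 8) / (256 + ((4 * 330) >> 3)) = 155, boosted
+ * to 155 + 155 / 3 = 206, so adjustment = (206 * 4 + 128) >> 8 = 3; the same
+ * arithmetic for diff = 16 gives 6, matching the level 1 and level 3 rows of
+ * the table below.)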
+ * This allowed us to use only a few levels to approximate the curve without + * changing the filtering algorithm too much. + * The adjustments were further corrected by checking the motion magnitude. + * The levels used are: + * diff level adjustment w/o adjustment w/ + * motion correction motion correction + * [-255, -16] 3 -6 -7 + * [-15, -8] 2 -4 -5 + * [-7, -4] 1 -3 -4 + * [-3, 3] 0 diff diff + * [4, 7] 1 3 4 + * [8, 15] 2 4 5 + * [16, 255] 3 6 7 + */ + +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, + int mc_running_avg_y_stride, + unsigned char *running_avg_y, + int running_avg_y_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + /* If motion_magnitude is small, make the denoiser more aggressive by + * increasing the adjustment for each level: the level1 adjustment is + * increased while the deltas stay the same. + */ + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + int64x2_t v_sum_diff_total = vdupq_n_s64(0); + + /* Go over lines. */ + int r; + for (r = 0; r < 16; ++r) { + /* Load inputs. */ + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level we are in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, + v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, + v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, + v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. */ + const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask, + v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask, + v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment, + v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = vaddq_u8( + v_level1and2_adjustment, v_level3_adjustment); + + /* Pick the adjustment's absolute value: the raw absolute difference + * when still in level 0, otherwise the combined value for levels 1, 2 + * and 3. + */ + const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask, + v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results.
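+ * (vqaddq_u8/vqsubq_u8 are saturating, so the filtered pixel stays within
+ * [0, 255] and no explicit clamp is needed before the store.)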
*/ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + + /* Too many adjustments => copy block. */ + { + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff_thresh = SUM_DIFF_THRESHOLD; + + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH; + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, which would otherwise not be denoised at all. The + // simplest option is to apply an additional adjustment to + // running_avg_y to bring it closer to sig. The adjustment is capped + // by a maximum delta, and chosen such that in most cases the + // resulting sum_diff will be within the acceptable range given by + // sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3. + if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 16; + mc_running_avg_y -= mc_running_avg_y_stride * 16; + running_avg_y -= running_avg_y_stride * 16; + for (r = 0; r < 16; ++r) { + uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y); + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, + v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, + v_mc_running_avg_y); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = + vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment); + v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = + vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, + fedcba98_76543210); + } + /* Update pointers for next iteration.
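+ * (Second pass: the three pointers were rewound by 16 rows above, so this
+ * loop revisits the same 16x16 block, this time moving each pixel by at
+ * most delta toward sig.)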
*/ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + { + // Update the sum of all pixel differences of this MB. + x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { + return COPY_BLOCK; + } + } + } + + /* Tell above level that block was filtered. */ + running_avg_y -= running_avg_y_stride * 16; + sig -= sig_stride * 16; + + vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride); + + return FILTER_BLOCK; +} + +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, + int mc_running_avg_stride, + unsigned char *running_avg, + int running_avg_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { + /* If motion_magnitude is small, make the denoiser more aggressive by + * increasing the adjustment for each level: the level1 adjustment is + * increased while the deltas stay the same. + */ + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0; + const uint8x16_t v_level1_adjustment = vmovq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3); + + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + int64x2_t v_sum_diff_total = vdupq_n_s64(0); + int r; + + { + uint16x4_t v_sum_block = vdup_n_u16(0); + + // Avoid denoising the color signal if it's close to the average level. + for (r = 0; r < 8; ++r) { + const uint8x8_t v_sig = vld1_u8(sig); + const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig); + v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block); + const uint64x1_t _76543210 = vpaddl_u32(_7654_3210); + const int sum_block = + vget_lane_s32(vreinterpret_s32_u64(_76543210), 0); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + /* Go over lines. */ + for (r = 0; r < 4; ++r) { + /* Load inputs. */ + const uint8x8_t v_sig_lo = vld1_u8(sig); + const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]); + const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi); + const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg); + const uint8x8_t v_mc_running_avg_hi = + vld1_u8(&mc_running_avg[mc_running_avg_stride]); + const uint8x16_t v_mc_running_avg = + vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi); + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg); + + /* Figure out which level we are in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, + v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, + v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, + v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3.
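+ * (These accumulate: level 1 contributes the base 3 (or 4 + shift_inc),
+ * level 2 adds 1 and level 3 adds 2 more, reproducing the 3/4/6 -- or
+ * 4/5/7 -- adjustment columns of the table at the top of this file.)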
*/ + const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask, + v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask, + v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment, + v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = vaddq_u8( + v_level1and2_adjustment, v_level3_adjustment); + + /* Pick the adjustment's absolute value: the raw absolute difference + * when still in level 0, otherwise the combined value for levels 1, 2 + * and 3. + */ + const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask, + v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + + uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment); + + /* Store results. */ + vst1_u8(running_avg, vget_low_u8(v_running_avg)); + vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg)); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. + */ + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); + } + + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_running_avg_stride * 2; + running_avg += running_avg_stride * 2; + } + + + /* Too many adjustments => copy block. */ + { + int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; + if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV; + if (sum_diff > sum_diff_thresh) { + // Before returning to copy the block (i.e., apply no denoising), + // check if we can still apply some (weaker) temporal filtering to + // this block, which would otherwise not be denoised at all. The + // simplest option is to apply an additional adjustment to + // running_avg to bring it closer to sig. The adjustment is capped + // by a maximum delta, and chosen such that in most cases the + // resulting sum_diff will be within the acceptable range given by + // sum_diff_thresh. + + // The delta is set by the excess of absolute pixel diff over the + // threshold. + int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1; + // Only apply the adjustment for max delta up to 3.
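+ // (Illustrative numbers: an excess of 600 over the threshold gives
+ // delta = (600 >> 8) + 1 = 3, so the weaker second pass runs; an
+ // excess of 1024 gives delta = 5, and the block is copied instead.)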
+ if (delta < 4) { + const uint8x16_t k_delta = vmovq_n_u8(delta); + sig -= sig_stride * 8; + mc_running_avg -= mc_running_avg_stride * 8; + running_avg -= running_avg_stride * 8; + for (r = 0; r < 4; ++r) { + const uint8x8_t v_sig_lo = vld1_u8(sig); + const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]); + const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi); + const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg); + const uint8x8_t v_mc_running_avg_hi = + vld1_u8(&mc_running_avg[mc_running_avg_stride]); + const uint8x16_t v_mc_running_avg = + vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi); + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, + v_mc_running_avg); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, + v_mc_running_avg); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, + v_mc_running_avg); + // Clamp absolute difference to delta to get the adjustment. + const uint8x16_t v_abs_adjustment = + vminq_u8(v_abs_diff, (k_delta)); + + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + const uint8x8_t v_running_avg_lo = vld1_u8(running_avg); + const uint8x8_t v_running_avg_hi = + vld1_u8(&running_avg[running_avg_stride]); + uint8x16_t v_running_avg = + vcombine_u8(v_running_avg_lo, v_running_avg_hi); + + v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment); + v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment); + + /* Store results. */ + vst1_u8(running_avg, vget_low_u8(v_running_avg)); + vst1_u8(&running_avg[running_avg_stride], + vget_high_u8(v_running_avg)); + + { + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment), + vreinterpretq_s8_u8(v_pos_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = + vpaddlq_s8(v_sum_diff); + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, + fedcba98_76543210); + } + /* Update pointers for next iteration. */ + sig += sig_stride * 2; + mc_running_avg += mc_running_avg_stride * 2; + running_avg += running_avg_stride * 2; + } + { + // Update the sum of all pixel differences of this MB. + x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } + } else { + return COPY_BLOCK; + } + } + } + + /* Tell above level that block was filtered. */ + running_avg -= running_avg_stride * 8; + sig -= sig_stride * 8; + + vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride); + + return FILTER_BLOCK; +} diff --git a/media/libvpx/vp8/encoder/arm/neon/picklpf_arm.c b/media/libvpx/vp8/encoder/arm/neon/picklpf_arm.c deleted file mode 100644 index ec8071e88b66..000000000000 --- a/media/libvpx/vp8/encoder/arm/neon/picklpf_arm.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "vp8/common/loopfilter.h" -#include "vpx_scale/yv12config.h" - -extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr, - unsigned char *src_ptr, - int sz); - - -void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) -{ - unsigned char *src_y, *dst_y; - int yheight; - int ystride; - int yoffset; - int linestocopy; - - yheight = src_ybc->y_height; - ystride = src_ybc->y_stride; - - /* number of MB rows to use in partial filtering */ - linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION; - linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ - - /* Copy extra 4 so that full filter context is available if filtering done - * on the copied partial frame and not original. Partial filter does mb - * filtering for top row also, which can modify3 pixels above. - */ - linestocopy += 4; - /* partial image starts at ~middle of frame (macroblock border) */ - yoffset = ystride * (((yheight >> 5) * 16) - 4); - src_y = src_ybc->y_buffer + yoffset; - dst_y = dst_ybc->y_buffer + yoffset; - - vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy); -} diff --git a/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm deleted file mode 100644 index 5ea8dd83dd78..000000000000 --- a/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.asm +++ /dev/null @@ -1,221 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_fdct4x4_neon| - EXPORT |vp8_short_fdct8x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=4 - - - ALIGN 16 ; enable use of @128 bit aligned loads -coeff - DCW 5352, 5352, 5352, 5352 - DCW 2217, 2217, 2217, 2217 - DCD 14500, 14500, 14500, 14500 - DCD 7500, 7500, 7500, 7500 - DCD 12000, 12000, 12000, 12000 - DCD 51000, 51000, 51000, 51000 - -;void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_neon| PROC - - ; Part one - vld1.16 {d0}, [r0@64], r2 - adr r12, coeff - vld1.16 {d1}, [r0@64], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {d2}, [r0@64], r2 - vld1.32 {q9, q10}, [r12@128]! 
; q9=14500, q10=7500 - vld1.16 {d3}, [r0@64], r2 - - ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3] - vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2] - vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2] - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3] - - vshl.s16 q2, q2, #3 ; (a1, b1) << 3 - vshl.s16 q3, q3, #3 ; (c1, d1) << 3 - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 - vsub.s16 d2, d4, d5 ; op[2] = a1 - b1 - - vmlal.s16 q9, d7, d16 ; d1*5352 + 14500 - vmlal.s16 q10, d7, d17 ; d1*2217 + 7500 - vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500 - - vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - - ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vmov.s16 d26, #7 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12] - vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8] - vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8] - vadd.s16 d4, d4, d26 ; a1 + 7 - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12] - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7 - vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7 - - vmlal.s16 q11, d7, d16 ; d1*5352 + 12000 - vmlal.s16 q12, d7, d17 ; d1*2217 + 51000 - - vceq.s16 d4, d7, #0 - - vshr.s16 d0, d0, #4 - vshr.s16 d2, d2, #4 - - vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000 - - vmvn d4, d4 - vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16 - vsub.s16 d1, d1, d4 ; op[4] += (d1!=0) - vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - -;void vp8_short_fdct8x4_c(short *input, short *output, int pitch) -|vp8_short_fdct8x4_neon| PROC - - ; Part one - - vld1.16 {q0}, [r0@128], r2 - adr r12, coeff - vld1.16 {q1}, [r0@128], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {q2}, [r0@128], r2 - vld1.32 {q9, q10}, [r12@128]! 
; q9=14500, q10=7500 - vld1.16 {q3}, [r0@128], r2 - - ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3] - vtrn.32 q0, q2 ; [A0|B0] - vtrn.32 q1, q3 ; [A1|B1] - vtrn.16 q0, q1 ; [A2|B2] - vtrn.16 q2, q3 ; [A3|B3] - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3] - vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2] - vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3] - - vshl.s16 q11, q11, #3 ; a1 << 3 - vshl.s16 q12, q12, #3 ; b1 << 3 - vshl.s16 q13, q13, #3 ; c1 << 3 - vshl.s16 q14, q14, #3 ; d1 << 3 - - vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1 - vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1 - - vmov.s16 q11, q9 ; 14500 - vmov.s16 q12, q10 ; 7500 - - vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500 - vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500 - vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500 - vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500 - - vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500 - vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500 - - vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12 - vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000 - - ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12] - vtrn.32 q0, q2 ; q0=[A0 | B0] - vtrn.32 q1, q3 ; q1=[A4 | B4] - vtrn.16 q0, q1 ; q2=[A8 | B8] - vtrn.16 q2, q3 ; q3=[A12|B12] - - vmov.s16 q15, #7 - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12] - vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8] - vadd.s16 q11, q11, q15 ; a1 + 7 - vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12] - - vadd.s16 q0, q11, q12 ; a1 + b1 + 7 - vsub.s16 q1, q11, q12 ; a1 - b1 + 7 - - vmov.s16 q11, q9 ; 12000 - vmov.s16 q12, q10 ; 51000 - - vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4 - vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4 - vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4 - vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4 - - - vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000 - vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000 - vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000 - vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000 - - vceq.s16 q14, q14, #0 - - vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000 - vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000 - - vmvn q14, q14 - - vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d1, d1, d28 ; A[4] += (d1!=0) - - vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d5, d5, d29 ; B[4] += (d1!=0) - - vst1.16 {q0, q1}, [r1@128]! ; block A - vst1.16 {q2, q3}, [r1@128]! ; block B - - bx lr - - ENDP - - END - diff --git a/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c new file mode 100644 index 000000000000..391e5f990709 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_fdct4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int16x4_t d16s16, d17s16, d26s16, dEmptys16;
+    uint16x4_t d4u16;
+    int16x8_t q0s16, q1s16;
+    int32x4_t q9s32, q10s32, q11s32, q12s32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    d16s16 = vdup_n_s16(5352);
+    d17s16 = vdup_n_s16(2217);
+    q9s32 = vdupq_n_s32(14500);
+    q10s32 = vdupq_n_s32(7500);
+    q11s32 = vdupq_n_s32(12000);
+    q12s32 = vdupq_n_s32(51000);
+
+    // Part one
+    pitch >>= 1;
+    d0s16 = vld1_s16(input);
+    input += pitch;
+    d1s16 = vld1_s16(input);
+    input += pitch;
+    d2s16 = vld1_s16(input);
+    input += pitch;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d4s16 = vshl_n_s16(d4s16, 3);
+    d5s16 = vshl_n_s16(d5s16, 3);
+    d6s16 = vshl_n_s16(d6s16, 3);
+    d7s16 = vshl_n_s16(d7s16, 3);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
+    q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
+    q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
+    q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
+
+    d1s16 = vshrn_n_s32(q9s32, 12);
+    d3s16 = vshrn_n_s32(q10s32, 12);
+
+    // Part two
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
+
+    d26s16 = vdup_n_s16(7);
+    d4s16 = vadd_s16(d4s16, d26s16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d2s16 = vsub_s16(d4s16, d5s16);
+
+    q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
+    q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
+
+    dEmptys16 = vdup_n_s16(0);
+    d4u16 = vceq_s16(d7s16, dEmptys16);
+
+    d0s16 = vshr_n_s16(d0s16, 4);
+    d2s16 = vshr_n_s16(d2s16, 4);
+
+    q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
+    q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
+
+    d4u16 = vmvn_u16(d4u16);
+    d1s16 = vshrn_n_s32(q11s32, 16);
+    d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
+    d3s16 = vshrn_n_s32(q12s32, 16);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
+
+void vp8_short_fdct8x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
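+    // Same butterfly structure as the 4x4 transform above, but two 4x4
+    // blocks (A and B in the old assembly's comments) travel together,
+    // one per half of each 128-bit q register.
+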
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16; + uint16x4_t d28u16, d29u16; + uint16x8_t q14u16; + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + int16x8x2_t v2tmp0, v2tmp1; + int32x4x2_t v2tmp2, v2tmp3; + + d16s16 = vdup_n_s16(5352); + d17s16 = vdup_n_s16(2217); + q9s32 = vdupq_n_s32(14500); + q10s32 = vdupq_n_s32(7500); + + // Part one + pitch >>= 1; + q0s16 = vld1q_s16(input); + input += pitch; + q1s16 = vld1q_s16(input); + input += pitch; + q2s16 = vld1q_s16(input); + input += pitch; + q3s16 = vld1q_s16(input); + + v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), + vreinterpretq_s32_s16(q2s16)); + v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), + vreinterpretq_s32_s16(q3s16)); + v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 + vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 + v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 + vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 + + q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); + q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); + + q11s16 = vshlq_n_s16(q11s16, 3); + q12s16 = vshlq_n_s16(q12s16, 3); + q13s16 = vshlq_n_s16(q13s16, 3); + q14s16 = vshlq_n_s16(q14s16, 3); + + q0s16 = vaddq_s16(q11s16, q12s16); + q2s16 = vsubq_s16(q11s16, q12s16); + + q11s32 = q9s32; + q12s32 = q10s32; + + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + q9s32 = vmlal_s16(q9s32, d28s16, d16s16); + q10s32 = vmlal_s16(q10s32, d28s16, d17s16); + q11s32 = vmlal_s16(q11s32, d29s16, d16s16); + q12s32 = vmlal_s16(q12s32, d29s16, d17s16); + + q9s32 = vmlal_s16(q9s32, d26s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); + q11s32 = vmlal_s16(q11s32, d27s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); + + d2s16 = vshrn_n_s32(q9s32, 12); + d6s16 = vshrn_n_s32(q10s32, 12); + d3s16 = vshrn_n_s32(q11s32, 12); + d7s16 = vshrn_n_s32(q12s32, 12); + q1s16 = vcombine_s16(d2s16, d3s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + // Part two + q9s32 = vdupq_n_s32(12000); + q10s32 = vdupq_n_s32(51000); + + v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), + vreinterpretq_s32_s16(q2s16)); + v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), + vreinterpretq_s32_s16(q3s16)); + v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 + vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 + v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 + vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 + + q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); + q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); + q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); + + q15s16 = vdupq_n_s16(7); + q11s16 = vaddq_s16(q11s16, q15s16); + q0s16 = vaddq_s16(q11s16, q12s16); + q1s16 = vsubq_s16(q11s16, q12s16); + + q11s32 = q9s32; + q12s32 = q10s32; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d2s16 = vget_low_s16(q1s16); + d3s16 = vget_high_s16(q1s16); + + d0s16 = vshr_n_s16(d0s16, 4); + d4s16 = vshr_n_s16(d1s16, 4); + d2s16 = vshr_n_s16(d2s16, 4); + d6s16 = vshr_n_s16(d3s16, 4); + + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + q9s32 = 
vmlal_s16(q9s32, d28s16, d16s16); + q10s32 = vmlal_s16(q10s32, d28s16, d17s16); + q11s32 = vmlal_s16(q11s32, d29s16, d16s16); + q12s32 = vmlal_s16(q12s32, d29s16, d17s16); + + q9s32 = vmlal_s16(q9s32, d26s16, d17s16); + q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); + q11s32 = vmlal_s16(q11s32, d27s16, d17s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); + + d1s16 = vshrn_n_s32(q9s32, 16); + d3s16 = vshrn_n_s32(q10s32, 16); + d5s16 = vshrn_n_s32(q11s32, 16); + d7s16 = vshrn_n_s32(q12s32, 16); + + qEmptys16 = vdupq_n_s16(0); + q14u16 = vceqq_s16(q14s16, qEmptys16); + q14u16 = vmvnq_u16(q14u16); + + d28u16 = vget_low_u16(q14u16); + d29u16 = vget_high_u16(q14u16); + d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16)); + d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16)); + + q0s16 = vcombine_s16(d0s16, d1s16); + q1s16 = vcombine_s16(d2s16, d3s16); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + vst1q_s16(output, q0s16); + vst1q_s16(output + 8, q1s16); + vst1q_s16(output + 16, q2s16); + vst1q_s16(output + 24, q3s16); + return; +} diff --git a/media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm deleted file mode 100644 index 5bda78678db9..000000000000 --- a/media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm +++ /dev/null @@ -1,199 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_subtract_b_neon| - EXPORT |vp8_subtract_mby_neon| - EXPORT |vp8_subtract_mbuv_neon| - - INCLUDE vp8_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) -|vp8_subtract_b_neon| PROC - - stmfd sp!, {r4-r7} - - ldr r3, [r0, #vp8_block_base_src] - ldr r4, [r0, #vp8_block_src] - ldr r5, [r0, #vp8_block_src_diff] - ldr r3, [r3] - ldr r6, [r0, #vp8_block_src_stride] - add r3, r3, r4 ; src = *base_src + src - ldr r7, [r1, #vp8_blockd_predictor] - - vld1.8 {d0}, [r3], r6 ;load src - vld1.8 {d1}, [r7], r2 ;load pred - vld1.8 {d2}, [r3], r6 - vld1.8 {d3}, [r7], r2 - vld1.8 {d4}, [r3], r6 - vld1.8 {d5}, [r7], r2 - vld1.8 {d6}, [r3], r6 - vld1.8 {d7}, [r7], r2 - - vsubl.u8 q10, d0, d1 - vsubl.u8 q11, d2, d3 - vsubl.u8 q12, d4, d5 - vsubl.u8 q13, d6, d7 - - mov r2, r2, lsl #1 - - vst1.16 {d20}, [r5], r2 ;store diff - vst1.16 {d22}, [r5], r2 - vst1.16 {d24}, [r5], r2 - vst1.16 {d26}, [r5], r2 - - ldmfd sp!, {r4-r7} - bx lr - - ENDP - - -;========================================== -;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride -; unsigned char *pred, int pred_stride) -|vp8_subtract_mby_neon| PROC - push {r4-r7} - mov r12, #4 - ldr r4, [sp, #16] ; pred_stride - mov r6, #32 ; "diff" stride x2 - add r5, r0, #16 ; second diff pointer - -subtract_mby_loop - vld1.8 {q0}, [r1], r2 ;load src - vld1.8 {q1}, [r3], r4 ;load pred - vld1.8 {q2}, [r1], r2 - vld1.8 {q3}, [r3], r4 - vld1.8 {q4}, [r1], r2 - vld1.8 {q5}, [r3], r4 - vld1.8 {q6}, [r1], r2 - vld1.8 {q7}, [r3], r4 - - vsubl.u8 q8, d0, d2 - vsubl.u8 q9, d1, d3 - vsubl.u8 q10, d4, d6 - vsubl.u8 q11, d5, d7 - vsubl.u8 q12, d8, d10 - vsubl.u8 q13, d9, d11 - vsubl.u8 q14, d12, d14 - vsubl.u8 q15, d13, 
d15 - - vst1.16 {q8}, [r0], r6 ;store diff - vst1.16 {q9}, [r5], r6 - vst1.16 {q10}, [r0], r6 - vst1.16 {q11}, [r5], r6 - vst1.16 {q12}, [r0], r6 - vst1.16 {q13}, [r5], r6 - vst1.16 {q14}, [r0], r6 - vst1.16 {q15}, [r5], r6 - - subs r12, r12, #1 - bne subtract_mby_loop - - pop {r4-r7} - bx lr - ENDP - -;================================= -;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, -; int src_stride, unsigned char *upred, -; unsigned char *vpred, int pred_stride) - -|vp8_subtract_mbuv_neon| PROC - push {r4-r7} - ldr r4, [sp, #16] ; upred - ldr r5, [sp, #20] ; vpred - ldr r6, [sp, #24] ; pred_stride - add r0, r0, #512 ; short *udiff = diff + 256; - mov r12, #32 ; "diff" stride x2 - add r7, r0, #16 ; second diff pointer - -;u - vld1.8 {d0}, [r1], r3 ;load usrc - vld1.8 {d1}, [r4], r6 ;load upred - vld1.8 {d2}, [r1], r3 - vld1.8 {d3}, [r4], r6 - vld1.8 {d4}, [r1], r3 - vld1.8 {d5}, [r4], r6 - vld1.8 {d6}, [r1], r3 - vld1.8 {d7}, [r4], r6 - vld1.8 {d8}, [r1], r3 - vld1.8 {d9}, [r4], r6 - vld1.8 {d10}, [r1], r3 - vld1.8 {d11}, [r4], r6 - vld1.8 {d12}, [r1], r3 - vld1.8 {d13}, [r4], r6 - vld1.8 {d14}, [r1], r3 - vld1.8 {d15}, [r4], r6 - - vsubl.u8 q8, d0, d1 - vsubl.u8 q9, d2, d3 - vsubl.u8 q10, d4, d5 - vsubl.u8 q11, d6, d7 - vsubl.u8 q12, d8, d9 - vsubl.u8 q13, d10, d11 - vsubl.u8 q14, d12, d13 - vsubl.u8 q15, d14, d15 - - vst1.16 {q8}, [r0], r12 ;store diff - vst1.16 {q9}, [r7], r12 - vst1.16 {q10}, [r0], r12 - vst1.16 {q11}, [r7], r12 - vst1.16 {q12}, [r0], r12 - vst1.16 {q13}, [r7], r12 - vst1.16 {q14}, [r0], r12 - vst1.16 {q15}, [r7], r12 - -;v - vld1.8 {d0}, [r2], r3 ;load vsrc - vld1.8 {d1}, [r5], r6 ;load vpred - vld1.8 {d2}, [r2], r3 - vld1.8 {d3}, [r5], r6 - vld1.8 {d4}, [r2], r3 - vld1.8 {d5}, [r5], r6 - vld1.8 {d6}, [r2], r3 - vld1.8 {d7}, [r5], r6 - vld1.8 {d8}, [r2], r3 - vld1.8 {d9}, [r5], r6 - vld1.8 {d10}, [r2], r3 - vld1.8 {d11}, [r5], r6 - vld1.8 {d12}, [r2], r3 - vld1.8 {d13}, [r5], r6 - vld1.8 {d14}, [r2], r3 - vld1.8 {d15}, [r5], r6 - - vsubl.u8 q8, d0, d1 - vsubl.u8 q9, d2, d3 - vsubl.u8 q10, d4, d5 - vsubl.u8 q11, d6, d7 - vsubl.u8 q12, d8, d9 - vsubl.u8 q13, d10, d11 - vsubl.u8 q14, d12, d13 - vsubl.u8 q15, d14, d15 - - vst1.16 {q8}, [r0], r12 ;store diff - vst1.16 {q9}, [r7], r12 - vst1.16 {q10}, [r0], r12 - vst1.16 {q11}, [r7], r12 - vst1.16 {q12}, [r0], r12 - vst1.16 {q13}, [r7], r12 - vst1.16 {q14}, [r0], r12 - vst1.16 {q15}, [r7], r12 - - pop {r4-r7} - bx lr - - ENDP - - END diff --git a/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c new file mode 100644 index 000000000000..d3ab7b165537 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+void vp8_subtract_b_neon(
+        BLOCK *be,
+        BLOCKD *bd,
+        int pitch) {
+    unsigned char *src_ptr, *predictor;
+    int src_stride;
+    int16_t *src_diff;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q10u16, q11u16, q12u16, q13u16;
+
+    src_ptr = *be->base_src + be->src;
+    src_stride = be->src_stride;
+    predictor = bd->predictor;
+
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d4u8 = vld1_u8(src_ptr);
+    src_ptr += src_stride;
+    d6u8 = vld1_u8(src_ptr);
+
+    d1u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d3u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d5u8 = vld1_u8(predictor);
+    predictor += pitch;
+    d7u8 = vld1_u8(predictor);
+
+    q10u16 = vsubl_u8(d0u8, d1u8);
+    q11u16 = vsubl_u8(d2u8, d3u8);
+    q12u16 = vsubl_u8(d4u8, d5u8);
+    q13u16 = vsubl_u8(d6u8, d7u8);
+
+    src_diff = be->src_diff;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
+    src_diff += pitch;
+    vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
+    return;
+}
+
+void vp8_subtract_mby_neon(
+        int16_t *diff,
+        unsigned char *src,
+        int src_stride,
+        unsigned char *pred,
+        int pred_stride) {
+    int i;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    for (i = 0; i < 8; i++) {  // subtract_mby_loop
+        q0u8 = vld1q_u8(src);
+        src += src_stride;
+        q2u8 = vld1q_u8(src);
+        src += src_stride;
+        q1u8 = vld1q_u8(pred);
+        pred += pred_stride;
+        q3u8 = vld1q_u8(pred);
+        pred += pred_stride;
+
+        q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
+        q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
+        q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
+        q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
+
+        vst1q_u16((uint16_t *)diff, q8u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q9u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q10u16);
+        diff += 8;
+        vst1q_u16((uint16_t *)diff, q11u16);
+        diff += 8;
+    }
+    return;
+}
+
+void vp8_subtract_mbuv_neon(
+        int16_t *diff,
+        unsigned char *usrc,
+        unsigned char *vsrc,
+        int src_stride,
+        unsigned char *upred,
+        unsigned char *vpred,
+        int pred_stride) {
+    int i, j;
+    unsigned char *src_ptr, *pred_ptr;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+    diff += 256;
+    for (i = 0; i < 2; i++) {
+        if (i == 0) {
+            src_ptr = usrc;
+            pred_ptr = upred;
+        } else if (i == 1) {
+            src_ptr = vsrc;
+            pred_ptr = vpred;
+        }
+
+        for (j = 0; j < 2; j++) {
+            d0u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d1u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d2u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d3u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d4u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d5u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+            d6u8 = vld1_u8(src_ptr);
+            src_ptr += src_stride;
+            d7u8 = vld1_u8(pred_ptr);
+            pred_ptr += pred_stride;
+
+            q8u16 = vsubl_u8(d0u8, d1u8);
+            q9u16 = vsubl_u8(d2u8, d3u8);
+            q10u16 = vsubl_u8(d4u8, d5u8);
+            q11u16 = vsubl_u8(d6u8, d7u8);
+
+            vst1q_u16((uint16_t *)diff, q8u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q9u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q10u16);
+            diff += 8;
+            vst1q_u16((uint16_t *)diff, q11u16);
+            diff += 8;
+        }
+    }
+    return;
+}
diff --git a/media/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
b/media/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm deleted file mode 100644 index 5b9f11e59352..000000000000 --- a/media/libvpx/vp8/encoder/arm/neon/vp8_memcpy_neon.asm +++ /dev/null @@ -1,70 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_memcpy_partial_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;========================================= -;this is not a full memcpy function!!! -;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr, -; int sz); -|vp8_memcpy_partial_neon| PROC - ;pld [r1] ;preload pred data - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - mov r12, r2, lsr #8 ;copy 256 bytes data at one time - -memcpy_neon_loop - vld1.8 {q0, q1}, [r1]! ;load src data - subs r12, r12, #1 - vld1.8 {q2, q3}, [r1]! - vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr - vld1.8 {q4, q5}, [r1]! - vst1.8 {q2, q3}, [r0]! - vld1.8 {q6, q7}, [r1]! - vst1.8 {q4, q5}, [r0]! - vld1.8 {q8, q9}, [r1]! - vst1.8 {q6, q7}, [r0]! - vld1.8 {q10, q11}, [r1]! - vst1.8 {q8, q9}, [r0]! - vld1.8 {q12, q13}, [r1]! - vst1.8 {q10, q11}, [r0]! - vld1.8 {q14, q15}, [r1]! - vst1.8 {q12, q13}, [r0]! - vst1.8 {q14, q15}, [r0]! - - ;pld [r1] ;preload pred data -- need to adjust for real device - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - bne memcpy_neon_loop - - ands r3, r2, #0xff ;extra copy - beq done_copy_neon_loop - -extra_copy_neon_loop - vld1.8 {q0}, [r1]! ;load src data - subs r3, r3, #16 - vst1.8 {q0}, [r0]! - bne extra_copy_neon_loop - -done_copy_neon_loop - bx lr - ENDP - - END diff --git a/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm index 55edbf5129ed..f82af3ee333a 100644 --- a/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm +++ b/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm @@ -27,6 +27,8 @@ ;from vp8_variance(). |vp8_mse16x16_neon| PROC + vpush {q7} + vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse vmov.i8 q8, #0 vmov.i8 q9, #0 @@ -62,7 +64,7 @@ mse16x16_neon_loop vadd.u32 q7, q7, q8 vadd.u32 q9, q9, q10 - ldr r12, [sp] ;load *sse from stack + ldr r12, [sp, #16] ;load *sse from stack vadd.u32 q10, q7, q9 vpaddl.u32 q1, q10 @@ -71,6 +73,7 @@ mse16x16_neon_loop vst1.32 {d0[0]}, [r12] vmov.32 r0, d0[0] + vpop {q7} bx lr ENDP @@ -82,6 +85,8 @@ mse16x16_neon_loop ; r2 unsigned char *ref_ptr, ; r3 int recon_stride |vp8_get4x4sse_cs_neon| PROC + vpush {q7} + vld1.8 {d0}, [r0], r1 ;Load up source and reference vld1.8 {d4}, [r2], r3 vld1.8 {d1}, [r0], r1 @@ -109,6 +114,8 @@ mse16x16_neon_loop vadd.u64 d0, d2, d3 vmov.32 r0, d0[0] + + vpop {q7} bx lr ENDP diff --git a/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm deleted file mode 100644 index 22266297a867..000000000000 --- a/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm +++ /dev/null @@ -1,103 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_walsh4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch) -; r0 short *input, -; r1 short *output, -; r2 int pitch -|vp8_short_walsh4x4_neon| PROC - - vld1.16 {d0}, [r0@64], r2 ; load input - vld1.16 {d1}, [r0@64], r2 - vld1.16 {d2}, [r0@64], r2 - vld1.16 {d3}, [r0@64] - - ;First for-loop - ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - - vmov.s32 q15, #3 ; add 3 to all values - - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d2 ; ip[0] + ip[2] - vadd.s16 d5, d1, d3 ; ip[1] + ip[3] - vsub.s16 d6, d1, d3 ; ip[1] - ip[3] - vsub.s16 d7, d0, d2 ; ip[0] - ip[2] - - vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2 - vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2 - vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2 - vceq.s16 d16, d4, #0 ; a1 == 0 - vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2 - - vadd.s16 d0, d4, d5 ; a1 + d1 - vmvn d16, d16 ; a1 != 0 - vsub.s16 d3, d4, d5 ; op[3] = a1 - d1 - vadd.s16 d1, d7, d6 ; op[1] = b1 + c1 - vsub.s16 d2, d7, d6 ; op[2] = b1 - c1 - vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0) - - ;Second for-loop - ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d1, d3 - vtrn.32 d0, d2 - vtrn.16 d2, d3 - vtrn.16 d0, d1 - - vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8] - vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12] - vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12] - vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8] - - vadd.s32 q0, q8, q9 ; a2 = a1 + d1 - vadd.s32 q1, q11, q10 ; b2 = b1 + c1 - vsub.s32 q2, q11, q10 ; c2 = b1 - c1 - vsub.s32 q3, q8, q9 ; d2 = a1 - d1 - - vclt.s32 q8, q0, #0 - vclt.s32 q9, q1, #0 - vclt.s32 q10, q2, #0 - vclt.s32 q11, q3, #0 - - ; subtract -1 (or 0) - vsub.s32 q0, q0, q8 ; a2 += a2 < 0 - vsub.s32 q1, q1, q9 ; b2 += b2 < 0 - vsub.s32 q2, q2, q10 ; c2 += c2 < 0 - vsub.s32 q3, q3, q11 ; d2 += d2 < 0 - - vadd.s32 q8, q0, q15 ; a2 + 3 - vadd.s32 q9, q1, q15 ; b2 + 3 - vadd.s32 q10, q2, q15 ; c2 + 3 - vadd.s32 q11, q3, q15 ; d2 + 3 - - ; vrshrn? would add 1 << 3-1 = 2 - vshrn.s32 d0, q8, #3 - vshrn.s32 d1, q9, #3 - vshrn.s32 d2, q10, #3 - vshrn.s32 d3, q11, #3 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - - END diff --git a/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c new file mode 100644 index 000000000000..d6b67f895034 --- /dev/null +++ b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+
+void vp8_short_walsh4x4_neon(
+        int16_t *input,
+        int16_t *output,
+        int pitch) {
+    uint16x4_t d16u16;
+    int16x8_t q0s16, q1s16;
+    int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+    int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
+    int32x4_t q9s32, q10s32, q11s32, q15s32;
+    uint32x4_t q8u32, q9u32, q10u32, q11u32;
+    int16x4x2_t v2tmp0, v2tmp1;
+    int32x2x2_t v2tmp2, v2tmp3;
+
+    dEmptys16 = vdup_n_s16(0);
+    qEmptys32 = vdupq_n_s32(0);
+    q15s32 = vdupq_n_s32(3);
+
+    d0s16 = vld1_s16(input);
+    input += pitch/2;
+    d1s16 = vld1_s16(input);
+    input += pitch/2;
+    d2s16 = vld1_s16(input);
+    input += pitch/2;
+    d3s16 = vld1_s16(input);
+
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp3.val[1]));  // d3
+
+    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
+    d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
+    d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
+
+    d4s16 = vshl_n_s16(d4s16, 2);
+    d5s16 = vshl_n_s16(d5s16, 2);
+    d6s16 = vshl_n_s16(d6s16, 2);
+    d7s16 = vshl_n_s16(d7s16, 2);
+
+    d16u16 = vceq_s16(d4s16, dEmptys16);
+    d16u16 = vmvn_u16(d16u16);
+
+    d0s16 = vadd_s16(d4s16, d5s16);
+    d3s16 = vsub_s16(d4s16, d5s16);
+    d1s16 = vadd_s16(d7s16, d6s16);
+    d2s16 = vsub_s16(d7s16, d6s16);
+
+    d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
+
+    // Second for-loop
+    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
+                      vreinterpret_s32_s16(d3s16));
+    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
+                      vreinterpret_s32_s16(d2s16));
+    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]),   // d2
+                      vreinterpret_s16_s32(v2tmp2.val[1]));  // d3
+    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]),   // d0
+                      vreinterpret_s16_s32(v2tmp2.val[0]));  // d1
+
+    q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+    q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
+    q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
+
+    q0s32 = vaddq_s32(q8s32, q9s32);
+    q1s32 = vaddq_s32(q11s32, q10s32);
+    q2s32 = vsubq_s32(q11s32, q10s32);
+    q3s32 = vsubq_s32(q8s32, q9s32);
+
+    q8u32 = vcltq_s32(q0s32, qEmptys32);
+    q9u32 = vcltq_s32(q1s32, qEmptys32);
+    q10u32 = vcltq_s32(q2s32, qEmptys32);
+    q11u32 = vcltq_s32(q3s32, qEmptys32);
+
+    q8s32 = vreinterpretq_s32_u32(q8u32);
+    q9s32 = vreinterpretq_s32_u32(q9u32);
+    q10s32 = vreinterpretq_s32_u32(q10u32);
+    q11s32 = vreinterpretq_s32_u32(q11u32);
+
+    q0s32 = vsubq_s32(q0s32, q8s32);
+    q1s32 = vsubq_s32(q1s32, q9s32);
+    q2s32 = vsubq_s32(q2s32, q10s32);
+    q3s32 = vsubq_s32(q3s32, q11s32);
+
+    q8s32 = vaddq_s32(q0s32, q15s32);
+    q9s32 = vaddq_s32(q1s32, q15s32);
+    q10s32 = vaddq_s32(q2s32, q15s32);
+    q11s32 = vaddq_s32(q3s32, q15s32);
+
+    d0s16 = vshrn_n_s32(q8s32, 3);
+    d1s16 = vshrn_n_s32(q9s32, 3);
+    d2s16 = vshrn_n_s32(q10s32, 3);
+    d3s16 = vshrn_n_s32(q11s32, 3);
+
+    q0s16 = vcombine_s16(d0s16, d1s16);
+    q1s16 = vcombine_s16(d2s16, d3s16);
+
+    vst1q_s16(output, q0s16);
+    vst1q_s16(output + 8, q1s16);
+    return;
+}
diff --git a/media/libvpx/vp8/encoder/bitstream.c b/media/libvpx/vp8/encoder/bitstream.c
index 78e54e2488b2..9d0e69cf44c6 100644
--- a/media/libvpx/vp8/encoder/bitstream.c
+++ b/media/libvpx/vp8/encoder/bitstream.c
@@
-18,7 +18,6 @@ #include #include #include -#include "vp8/common/pragmas.h" #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" #include "bitstream.h" diff --git a/media/libvpx/vp8/encoder/bitstream.h b/media/libvpx/vp8/encoder/bitstream.h index 455a94fbed31..66f4bf67ee42 100644 --- a/media/libvpx/vp8/encoder/bitstream.h +++ b/media/libvpx/vp8/encoder/bitstream.h @@ -9,23 +9,27 @@ */ -#ifndef __INC_BITSTREAM_H -#define __INC_BITSTREAM_H +#ifndef VP8_ENCODER_BITSTREAM_H_ +#define VP8_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif #if HAVE_EDSP void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount, - const vp8_token *, + vp8_token *, const vp8_extra_bit_struct *, const vp8_tree_index *); void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char * cx_data, const unsigned char *cx_data_end, int num_parts, - const vp8_token *, + vp8_token *, const vp8_extra_bit_struct *, const vp8_tree_index *); void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w, - const vp8_token *, + vp8_token *, const vp8_extra_bit_struct *, const vp8_tree_index *); # define pack_tokens(a,b,c) \ @@ -43,4 +47,8 @@ void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount); # define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b) #endif +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_BITSTREAM_H_ diff --git a/media/libvpx/vp8/encoder/block.h b/media/libvpx/vp8/encoder/block.h index cf74c7aafc46..1f212cae8bea 100644 --- a/media/libvpx/vp8/encoder/block.h +++ b/media/libvpx/vp8/encoder/block.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_BLOCK_H -#define __INC_BLOCK_H +#ifndef VP8_ENCODER_BLOCK_H_ +#define VP8_ENCODER_BLOCK_H_ #include "vp8/common/onyx.h" #include "vp8/common/blockd.h" @@ -18,6 +18,10 @@ #include "vp8/common/entropy.h" #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_MODES 20 #define MAX_ERROR_BINS 1024 @@ -123,6 +127,7 @@ typedef struct macroblock int q_index; #if CONFIG_TEMPORAL_DENOISING + int increase_denoising; MB_PREDICTION_MODE best_sse_inter_mode; int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; @@ -160,4 +165,8 @@ typedef struct macroblock } MACROBLOCK; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_BLOCK_H_ diff --git a/media/libvpx/vp8/encoder/boolhuff.h b/media/libvpx/vp8/encoder/boolhuff.h index 39ab586b52b3..7c012a829696 100644 --- a/media/libvpx/vp8/encoder/boolhuff.h +++ b/media/libvpx/vp8/encoder/boolhuff.h @@ -16,12 +16,16 @@ * Description : Bool Coder header file. 
* ****************************************************************************/ -#ifndef __INC_BOOLHUFF_H -#define __INC_BOOLHUFF_H +#ifndef VP8_ENCODER_BOOLHUFF_H_ +#define VP8_ENCODER_BOOLHUFF_H_ #include "vpx_ports/mem.h" #include "vpx/internal/vpx_codec_internal.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { unsigned int lowvalue; @@ -31,10 +35,6 @@ typedef struct unsigned char *buffer; unsigned char *buffer_end; struct vpx_internal_error_info *error; - - /* Variables used to track bit costs without outputing to the bitstream */ - unsigned int measure_cost; - unsigned long bit_counter; } BOOL_CODER; extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end); @@ -125,4 +125,8 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) br->range = range; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_BOOLHUFF_H_ diff --git a/media/libvpx/vp8/encoder/dct_value_cost.h b/media/libvpx/vp8/encoder/dct_value_cost.h index e892765c69ff..1cd3eec84ac3 100644 --- a/media/libvpx/vp8/encoder/dct_value_cost.h +++ b/media/libvpx/vp8/encoder/dct_value_cost.h @@ -8,6 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_ENCODER_DCT_VALUE_COST_H_ +#define VP8_ENCODER_DCT_VALUE_COST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + /* Generated file, included by tokenize.c */ /* Values generated by fill_value_tokens() */ @@ -356,3 +363,9 @@ static const short dct_value_cost[2048*2] = 8134, 8140, 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243, 8251, 8257, 8265, 8275 }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DCT_VALUE_COST_H_ diff --git a/media/libvpx/vp8/encoder/dct_value_tokens.h b/media/libvpx/vp8/encoder/dct_value_tokens.h index ef08eeddc77c..c2aadefca79c 100644 --- a/media/libvpx/vp8/encoder/dct_value_tokens.h +++ b/media/libvpx/vp8/encoder/dct_value_tokens.h @@ -8,6 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#define VP8_ENCODER_DCT_VALUE_TOKENS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + /* Generated file, included by tokenize.c */ /* Values generated by fill_value_tokens() */ @@ -697,3 +704,9 @@ static const TOKENVALUE dct_value_tokens[2048*2] = {10, 3942}, {10, 3944}, {10, 3946}, {10, 3948}, {10, 3950}, {10, 3952}, {10, 3954}, {10, 3956}, {10, 3958}, {10, 3960} }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DCT_VALUE_TOKENS_H_ diff --git a/media/libvpx/vp8/encoder/defaultcoefcounts.h b/media/libvpx/vp8/encoder/defaultcoefcounts.h index 2c0f3ddf3b3f..1e8e80484ae2 100644 --- a/media/libvpx/vp8/encoder/defaultcoefcounts.h +++ b/media/libvpx/vp8/encoder/defaultcoefcounts.h @@ -8,6 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/
+#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Generated file, included by entropy.c */
 
 static const unsigned int default_coef_counts[BLOCK_TYPES]
@@ -221,3 +228,9 @@ static const unsigned int default_coef_counts[BLOCK_TYPES]
         },
     },
 };
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_
diff --git a/media/libvpx/vp8/encoder/denoising.c b/media/libvpx/vp8/encoder/denoising.c
index 7819265471e1..0c98eb1a55f9 100644
--- a/media/libvpx/vp8/encoder/denoising.c
+++ b/media/libvpx/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "denoising.h"
 
 #include "vp8/common/reconinter.h"
@@ -21,6 +23,7 @@ static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
  */
 static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
+static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
 
 /*
  * The filter function was modified to reduce the computational complexity.
@@ -51,27 +54,32 @@ static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
  * [16, 255]             6           7
  */
 
-int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
-                          YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
-                          unsigned int motion_magnitude, int y_offset,
-                          int uv_offset)
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
+                          unsigned char *running_avg_y, int avg_y_stride,
+                          unsigned char *sig, int sig_stride,
+                          unsigned int motion_magnitude,
+                          int increase_denoising)
 {
-    unsigned char *sig = signal->thismb;
-    int sig_stride = 16;
-    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
-    int mc_avg_y_stride = mc_running_avg->y_stride;
-    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
-    int avg_y_stride = running_avg->y_stride;
-    int r, c, i;
+    unsigned char *running_avg_y_start = running_avg_y;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
     int sum_diff = 0;
     int adj_val[3] = {3, 4, 6};
-
+    int shift_inc1 = 0;
+    int shift_inc2 = 1;
     /* If motion_magnitude is small, making the denoiser more aggressive by
-     * increasing the adjustment for each level. */
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increased denoising. */
     if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
     {
-        for (i = 0; i < 3; i++)
-            adj_val[i] += 1;
+        if (increase_denoising) {
+          shift_inc1 = 1;
+          shift_inc2 = 2;
+        }
+        adj_val[0] += shift_inc2;
+        adj_val[1] += shift_inc2;
+        adj_val[2] += shift_inc2;
     }
 
     for (r = 0; r < 16; ++r)
@@ -85,8 +93,9 @@ int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
             diff = mc_running_avg_y[c] - sig[c];
             absdiff = abs(diff);
 
-            /* When |diff| < 4, use pixel value from last denoised raw. */
-            if (absdiff <= 3)
+            // When |diff| <= |3 + shift_inc1|, use pixel value from
+            // last denoised raw.
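+            // (larger |diff| values fall through to the adj_val bands:
+            // 4..7 -> adj_val[0], 8..15 -> adj_val[1], 16+ -> adj_val[2].)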
+            if (absdiff <= 3 + shift_inc1)
             {
                 running_avg_y[c] = mc_running_avg_y[c];
                 sum_diff += diff;
@@ -127,18 +136,241 @@ int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
         running_avg_y += avg_y_stride;
     }
 
-    if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
-        return COPY_BLOCK;
+    sum_diff_thresh = SUM_DIFF_THRESHOLD;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_y to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // acceptable range given by sum_diff_thresh.
 
-    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
-                      signal->thismb, sig_stride);
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 16;
+        mc_running_avg_y -= mc_avg_y_stride * 16;
+        running_avg_y -= avg_y_stride * 16;
+        for (r = 0; r < 16; ++r) {
+          for (c = 0; c < 16; ++c) {
+            int diff = mc_running_avg_y[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_y[c] - adjustment < 0)
+                running_avg_y[c] = 0;
+              else
+                running_avg_y[c] = running_avg_y[c] - adjustment;
+              sum_diff -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_y[c] + adjustment > 255)
+                running_avg_y[c] = 255;
+              else
+                running_avg_y[c] = running_avg_y[c] + adjustment;
+              sum_diff += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_y += mc_avg_y_stride;
+          running_avg_y += avg_y_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+
+    vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
     return FILTER_BLOCK;
 }
 
-int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
+                             int mc_avg_uv_stride,
+                             unsigned char *running_avg_uv,
+                             int avg_uv_stride,
+                             unsigned char *sig,
+                             int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+    unsigned char *running_avg_uv_start = running_avg_uv;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r, c;
+    int sum_diff = 0;
+    int sum_block = 0;
+    int adj_val[3] = {3, 4, 6};
+    int shift_inc1 = 0;
+    int shift_inc2 = 1;
+    /* If motion_magnitude is small, make the denoiser more aggressive by
+     * increasing the adjustment for each level. Add another increment for
+     * blocks that are labeled for increased denoising. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
+      if (increase_denoising) {
+        shift_inc1 = 1;
+        shift_inc2 = 2;
+      }
+      adj_val[0] += shift_inc2;
+      adj_val[1] += shift_inc2;
+      adj_val[2] += shift_inc2;
+    }
+
+    // Avoid denoising color signal if it's close to average level.
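+    // (the block sum below is compared against 128 * 8 * 8, i.e. an
+    // all-mid-gray 8x8 block, before any filtering is attempted.)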
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        sum_block += sig[c];
+      }
+      sig += sig_stride;
+    }
+    if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+      return COPY_BLOCK;
+    }
+
+    sig -= sig_stride * 8;
+    for (r = 0; r < 8; ++r) {
+      for (c = 0; c < 8; ++c) {
+        int diff = 0;
+        int adjustment = 0;
+        int absdiff = 0;
+
+        diff = mc_running_avg_uv[c] - sig[c];
+        absdiff = abs(diff);
+
+        // When |diff| <= |3 + shift_inc1|, use pixel value from
+        // last denoised raw.
+        if (absdiff <= 3 + shift_inc1) {
+          running_avg_uv[c] = mc_running_avg_uv[c];
+          sum_diff += diff;
+        } else {
+          if (absdiff >= 4 && absdiff <= 7)
+            adjustment = adj_val[0];
+          else if (absdiff >= 8 && absdiff <= 15)
+            adjustment = adj_val[1];
+          else
+            adjustment = adj_val[2];
+          if (diff > 0) {
+            if ((sig[c] + adjustment) > 255)
+              running_avg_uv[c] = 255;
+            else
+              running_avg_uv[c] = sig[c] + adjustment;
+            sum_diff += adjustment;
+          } else {
+            if ((sig[c] - adjustment) < 0)
+              running_avg_uv[c] = 0;
+            else
+              running_avg_uv[c] = sig[c] - adjustment;
+            sum_diff -= adjustment;
+          }
+        }
+      }
+      /* Update pointers for next iteration. */
+      sig += sig_stride;
+      mc_running_avg_uv += mc_avg_uv_stride;
+      running_avg_uv += avg_uv_stride;
+    }
+
+    sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+    if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising), check
+      // if we can still apply some (weaker) temporal filtering to this block,
+      // that would otherwise not be denoised at all. Simplest is to apply
+      // an additional adjustment to running_avg_y to bring it closer to sig.
+      // The adjustment is capped by a maximum delta, and chosen such that
+      // in most cases the resulting sum_diff will be within the
+      // acceptable range given by sum_diff_thresh.
+
+      // The delta is set by the excess of absolute pixel diff over threshold.
+      int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+      // Only apply the adjustment for max delta up to 3.
+      if (delta < 4) {
+        sig -= sig_stride * 8;
+        mc_running_avg_uv -= mc_avg_uv_stride * 8;
+        running_avg_uv -= avg_uv_stride * 8;
+        for (r = 0; r < 8; ++r) {
+          for (c = 0; c < 8; ++c) {
+            int diff = mc_running_avg_uv[c] - sig[c];
+            int adjustment = abs(diff);
+            if (adjustment > delta)
+              adjustment = delta;
+            if (diff > 0) {
+              // Bring denoised signal down.
+              if (running_avg_uv[c] - adjustment < 0)
+                running_avg_uv[c] = 0;
+              else
+                running_avg_uv[c] = running_avg_uv[c] - adjustment;
+              sum_diff -= adjustment;
+            } else if (diff < 0) {
+              // Bring denoised signal up.
+              if (running_avg_uv[c] + adjustment > 255)
+                running_avg_uv[c] = 255;
+              else
+                running_avg_uv[c] = running_avg_uv[c] + adjustment;
+              sum_diff += adjustment;
+            }
+          }
+          // TODO(marpan): Check here if abs(sum_diff) has gone below the
+          // threshold sum_diff_thresh, and if so, we can exit the row loop.
+          sig += sig_stride;
+          mc_running_avg_uv += mc_avg_uv_stride;
+          running_avg_uv += avg_uv_stride;
+        }
+        if (abs(sum_diff) > sum_diff_thresh)
+          return COPY_BLOCK;
+      } else {
+        return COPY_BLOCK;
+      }
+    }
+
+    vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
+                    sig_stride);
+    return FILTER_BLOCK;
+}
+
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
+  assert(mode > 0);  // Denoiser is allocated only if mode > 0.
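+  // Mode mapping below: 1 -> denoise Y only, 2 -> YUV, 3 -> aggressive
+  // YUV, anything else -> adaptive.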
+  if (mode == 1) {
+    denoiser->denoiser_mode = kDenoiserOnYOnly;
+  } else if (mode == 2) {
+    denoiser->denoiser_mode = kDenoiserOnYUV;
+  } else if (mode == 3) {
+    denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
+  } else {
+    denoiser->denoiser_mode = kDenoiserOnAdaptive;
+  }
+  if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
+    denoiser->denoise_pars.scale_sse_thresh = 1;
+    denoiser->denoise_pars.scale_motion_thresh = 8;
+    denoiser->denoise_pars.scale_increase_filter = 0;
+    denoiser->denoise_pars.denoise_mv_bias = 95;
+    denoiser->denoise_pars.pickmode_mv_bias = 100;
+    denoiser->denoise_pars.qp_thresh = 0;
+    denoiser->denoise_pars.consec_zerolast = UINT_MAX;
+  } else {
+    denoiser->denoise_pars.scale_sse_thresh = 2;
+    denoiser->denoise_pars.scale_motion_thresh = 16;
+    denoiser->denoise_pars.scale_increase_filter = 1;
+    denoiser->denoise_pars.denoise_mv_bias = 60;
+    denoiser->denoise_pars.pickmode_mv_bias = 60;
+    denoiser->denoise_pars.qp_thresh = 100;
+    denoiser->denoise_pars.consec_zerolast = 10;
+  }
+}
+
+int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
+                          int num_mb_rows, int num_mb_cols, int mode)
 {
     int i;
     assert(denoiser);
+    denoiser->num_mb_cols = num_mb_cols;
 
     for (i = 0; i < MAX_REF_FRAMES; i++)
     {
@@ -166,9 +398,30 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
     vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
                denoiser->yv12_mc_running_avg.frame_size);
 
+
+    if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
+                                    height, VP8BORDERINPIXELS) < 0) {
+      vp8_denoiser_free(denoiser);
+      return 1;
+    }
+    vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
+               denoiser->yv12_last_source.frame_size);
+
+    denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
+    vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
+    vp8_denoiser_set_parameters(denoiser, mode);
+    denoiser->nmse_source_diff = 0;
+    denoiser->nmse_source_diff_count = 0;
+    // TODO(marpan): Adjust thresholds, including effect on resolution.
+    // Check the larger resolution first so both branches are reachable.
+    denoiser->threshold_aggressive_mode = 35;
+    if (width * height > 1280 * 720)
+      denoiser->threshold_aggressive_mode = 1400;
+    else if (width * height > 640 * 480)
+      denoiser->threshold_aggressive_mode = 150;
     return 0;
 }
 
+
 void vp8_denoiser_free(VP8_DENOISER *denoiser)
 {
     int i;
@@ -179,6 +432,8 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
         vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
     }
     vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
+    vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
+    vpx_free(denoiser->denoise_state);
 }
 
 
@@ -187,16 +442,28 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              unsigned int best_sse,
                              unsigned int zero_mv_sse,
                              int recon_yoffset,
-                             int recon_uvoffset)
+                             int recon_uvoffset,
+                             loop_filter_info_n *lfi_n,
+                             int mb_row,
+                             int mb_col,
+                             int block_index)
+
 {
     int mv_row;
     int mv_col;
+    unsigned int motion_threshold;
     unsigned int motion_magnitude2;
-
+    unsigned int sse_thresh;
+    int sse_diff_thresh = 0;
+    // Spatial loop filter: only applied selectively based on
+    // temporal filter state of block relative to top/left neighbors.
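+    // (the deblocking strength itself is fixed at a nominal filter_level
+    // of 32 further down; only the choice of which edges to filter is
+    // adaptive.)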
+    int apply_spatial_loop_filter = 1;
     MV_REFERENCE_FRAME frame = x->best_reference_frame;
     MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
     enum vp8_denoiser_decision decision = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_u = COPY_BLOCK;
+    enum vp8_denoiser_decision decision_v = COPY_BLOCK;
 
     if (zero_frame)
     {
@@ -206,7 +473,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         MB_MODE_INFO saved_mbmi;
         MACROBLOCKD *filter_xd = &x->e_mbd;
         MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
-        int sse_diff = zero_mv_sse - best_sse;
+        int sse_diff = 0;
+        // Bias on zero motion vector sse.
+        const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
+        zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
+        sse_diff = zero_mv_sse - best_sse;
 
         saved_mbmi = *mbmi;
 
@@ -217,11 +488,16 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
         mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
         mv_col = x->best_sse_mv.as_mv.col;
         mv_row = x->best_sse_mv.as_mv.row;
+        // Bias to zero_mv if small amount of motion.
+        // Note sse_diff_thresh is initialized to zero, so this ensures
+        // we will always choose zero_mv for denoising if
+        // zero_mv_sse <= best_sse (i.e., sse_diff <= 0).
+        if ((unsigned int)(mv_row * mv_row + mv_col * mv_col)
+            <= NOISE_MOTION_THRESHOLD)
+            sse_diff_thresh = (int)SSE_DIFF_THRESHOLD;
 
         if (frame == INTRA_FRAME ||
-            ((unsigned int)(mv_row *mv_row + mv_col *mv_col)
-              <= NOISE_MOTION_THRESHOLD &&
-             sse_diff < (int)SSE_DIFF_THRESHOLD))
+            sse_diff <= sse_diff_thresh)
         {
             /*
              * Handle intra blocks as referring to last frame with zero motion
@@ -277,20 +553,62 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
     mv_row = x->best_sse_mv.as_mv.row;
     mv_col = x->best_sse_mv.as_mv.col;
     motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
-    if (best_sse > SSE_THRESHOLD || motion_magnitude2
-           > 8 * NOISE_MOTION_THRESHOLD)
-    {
-        decision = COPY_BLOCK;
-    }
+    motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+        NOISE_MOTION_THRESHOLD;
+
+    if (motion_magnitude2 <
+        denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+      x->increase_denoising = 1;
+
+    sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+    if (x->increase_denoising)
+      sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+    if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+      decision = COPY_BLOCK;
 
     if (decision == FILTER_BLOCK)
     {
+        unsigned char *mc_running_avg_y =
+            denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset;
+        int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride;
+        unsigned char *running_avg_y =
+            denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset;
+        int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
+
         /* Filter. */
-        decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
-                                       &denoiser->yv12_running_avg[INTRA_FRAME],
-                                       x,
-                                       motion_magnitude2,
-                                       recon_yoffset, recon_uvoffset);
+        decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
+                                       running_avg_y, avg_y_stride,
+                                       x->thismb, 16, motion_magnitude2,
+                                       x->increase_denoising);
+        denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
+            kFilterNonZeroMV : kFilterZeroMV;
+        // Only denoise UV for zero motion, and if y channel was denoised.
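The zero-MV bias introduced above reads as: discount the zero-motion SSE by denoise_mv_bias percent, then prefer the zero motion vector unless the best MV beats the discounted value by more than SSE_DIFF_THRESHOLD, and even that only when the best MV is small. A compact restatement (illustrative helper, mirroring the integer arithmetic above; the UV branch continues below):

    #include <stdint.h>

    /* Illustrative: returns 1 when denoising should use the zero MV. */
    static int prefer_zero_mv(unsigned int zero_mv_sse, unsigned int best_sse,
                              int mv_row, int mv_col, int mv_bias_pct,
                              unsigned int noise_motion_thresh,
                              int sse_diff_thresh_base)
    {
        const unsigned int mag2 =
            (unsigned int)(mv_row * mv_row + mv_col * mv_col);
        const unsigned int biased =
            (unsigned int)((int64_t)zero_mv_sse * mv_bias_pct / 100);
        const int sse_diff = (int)(biased - best_sse);
        /* Threshold stays 0 for large motion, so the zero MV still wins
         * whenever its biased SSE is no worse than the best MV's SSE. */
        const int thresh =
            (mag2 <= noise_motion_thresh) ? sse_diff_thresh_base : 0;
        return sse_diff <= thresh;
    }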
+ if (denoiser->denoiser_mode != kDenoiserOnYOnly && + motion_magnitude2 == 0 && + decision == FILTER_BLOCK) { + unsigned char *mc_running_avg_u = + denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset; + unsigned char *running_avg_u = + denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset; + unsigned char *mc_running_avg_v = + denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset; + unsigned char *running_avg_v = + denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset; + int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride; + int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride; + int signal_stride = x->block[16].src_stride; + decision_u = + vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride, + running_avg_u, avg_uv_stride, + x->block[16].src + *x->block[16].base_src, + signal_stride, motion_magnitude2, 0); + decision_v = + vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride, + running_avg_v, avg_uv_stride, + x->block[20].src + *x->block[20].base_src, + signal_stride, motion_magnitude2, 0); + } } if (decision == COPY_BLOCK) { @@ -301,5 +619,73 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, x->thismb, 16, denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, denoiser->yv12_running_avg[INTRA_FRAME].y_stride); + denoiser->denoise_state[block_index] = kNoFilter; + } + if (denoiser->denoiser_mode != kDenoiserOnYOnly) { + if (decision_u == COPY_BLOCK) { + vp8_copy_mem8x8( + x->block[16].src + *x->block[16].base_src, x->block[16].src_stride, + denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset, + denoiser->yv12_running_avg[INTRA_FRAME].uv_stride); + } + if (decision_v == COPY_BLOCK) { + vp8_copy_mem8x8( + x->block[20].src + *x->block[20].base_src, x->block[16].src_stride, + denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset, + denoiser->yv12_running_avg[INTRA_FRAME].uv_stride); + } + } + // Option to selectively deblock the denoised signal, for y channel only. + if (apply_spatial_loop_filter) { + loop_filter_info lfi; + int apply_filter_col = 0; + int apply_filter_row = 0; + int apply_filter = 0; + int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride; + int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride; + + // Fix filter level to some nominal value for now. + int filter_level = 32; + + int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + // Apply filter if there is a difference in the denoiser filter state + // between the current and left/top block, or if non-zero motion vector + // is used for the motion-compensated filtering. + if (mb_col > 0) { + apply_filter_col = !((denoiser->denoise_state[block_index] == + denoiser->denoise_state[block_index - 1]) && + denoiser->denoise_state[block_index] != kFilterNonZeroMV); + if (apply_filter_col) { + // Filter left vertical edge. + apply_filter = 1; + vp8_loop_filter_mbv( + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + NULL, NULL, y_stride, uv_stride, &lfi); + } + } + if (mb_row > 0) { + apply_filter_row = !((denoiser->denoise_state[block_index] == + denoiser->denoise_state[block_index - denoiser->num_mb_cols]) && + denoiser->denoise_state[block_index] != kFilterNonZeroMV); + if (apply_filter_row) { + // Filter top horizontal edge. 
+ apply_filter = 1; + vp8_loop_filter_mbh( + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + NULL, NULL, y_stride, uv_stride, &lfi); + } + } + if (apply_filter) { + // Update the signal block |x|. Pixel changes are only to top and/or + // left boundary pixels: can we avoid full block copy here. + vp8_copy_mem16x16( + denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset, + y_stride, x->thismb, 16); + } } } diff --git a/media/libvpx/vp8/encoder/denoising.h b/media/libvpx/vp8/encoder/denoising.h index b025f5cdf01b..e1844f18ad3a 100644 --- a/media/libvpx/vp8/encoder/denoising.h +++ b/media/libvpx/vp8/encoder/denoising.h @@ -12,31 +12,98 @@ #define VP8_ENCODER_DENOISING_H_ #include "block.h" +#include "vp8/common/loopfilter.h" + +#ifdef __cplusplus +extern "C" { +#endif #define SUM_DIFF_THRESHOLD (16 * 16 * 2) +#define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3) #define MOTION_MAGNITUDE_THRESHOLD (8*3) +#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5) +#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2) +#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8) +#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3) + enum vp8_denoiser_decision { COPY_BLOCK, FILTER_BLOCK }; +enum vp8_denoiser_filter_state { + kNoFilter, + kFilterZeroMV, + kFilterNonZeroMV +}; + +enum vp8_denoiser_mode { + kDenoiserOff, + kDenoiserOnYOnly, + kDenoiserOnYUV, + kDenoiserOnYUVAggressive, + kDenoiserOnAdaptive +}; + +typedef struct { + // Scale factor on sse threshold above which no denoising is done. + unsigned int scale_sse_thresh; + // Scale factor on motion magnitude threshold above which no + // denoising is done. + unsigned int scale_motion_thresh; + // Scale factor on motion magnitude below which we increase the strength of + // the temporal filter (in function vp8_denoiser_filter). + unsigned int scale_increase_filter; + // Scale factor to bias to ZEROMV for denoising. + unsigned int denoise_mv_bias; + // Scale factor to bias to ZEROMV for coding mode selection. + unsigned int pickmode_mv_bias; + // Quantizer threshold below which we use the segmentation map to switch off + // loop filter for blocks that have been coded as ZEROMV-LAST a certain number + // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to + // 0 when segmentation map is used for shutting off loop filter. + unsigned int qp_thresh; + // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST. + unsigned int consec_zerolast; +} denoise_params; + typedef struct vp8_denoiser { YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES]; YV12_BUFFER_CONFIG yv12_mc_running_avg; + // TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peak. 
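The spatial deblock trigger implemented above (left and top edges of the denoised macroblock) fires whenever the denoiser state differs between neighbors, and also whenever both neighbors used a non-zero motion vector. As a truth table over vp8_denoiser_filter_state (illustrative helper; the VP8_DENOISER struct continues below):

    /* Filter the shared edge unless both blocks have the same state and
     * that state is not kFilterNonZeroMV (enum values: kNoFilter = 0,
     * kFilterZeroMV = 1, kFilterNonZeroMV = 2). */
    static int filter_shared_edge(int cur_state, int neighbor_state)
    {
        return !(cur_state == neighbor_state && cur_state != 2);
    }
    /* filter_shared_edge(0, 0) -> 0  (both unfiltered: skip)
     * filter_shared_edge(1, 0) -> 1  (state change: filter)
     * filter_shared_edge(2, 2) -> 1  (both non-zero MV: filter anyway) */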
+ YV12_BUFFER_CONFIG yv12_last_source; + unsigned char* denoise_state; + int num_mb_cols; + int denoiser_mode; + int threshold_aggressive_mode; + int nmse_source_diff; + int nmse_source_diff_count; + denoise_params denoise_pars; } VP8_DENOISER; -int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height); +int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, + int num_mb_rows, int num_mb_cols, int mode); void vp8_denoiser_free(VP8_DENOISER *denoiser); +void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode); + void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x, unsigned int best_sse, unsigned int zero_mv_sse, int recon_yoffset, - int recon_uvoffset); + int recon_uvoffset, + loop_filter_info_n *lfi_n, + int mb_row, + int mb_col, + int block_index); -#endif /* VP8_ENCODER_DENOISING_H_ */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DENOISING_H_ diff --git a/media/libvpx/vp8/encoder/encodeframe.c b/media/libvpx/vp8/encoder/encodeframe.c index b550f6be1cfd..aec6b9880c11 100644 --- a/media/libvpx/vp8/encoder/encodeframe.c +++ b/media/libvpx/vp8/encoder/encodeframe.c @@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi, } #endif + // Keep track of how many (consecutive) times a block is coded + // as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index+mb_col] < 255) + cpi->consec_zero_last[map_index+mb_col] += 1; + } else { + cpi->consec_zero_last[map_index+mb_col] = 0; + } + } /* Special case code for cyclic refresh * If cyclic update enabled then copy xd->mbmi.segment_id; (which @@ -1246,7 +1259,7 @@ int vp8cx_encode_inter_macroblock x->zbin_mode_boost_enabled = 0; } vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, - &distortion, &intra_error); + &distortion, &intra_error, mb_row, mb_col); /* switch back to the regular quantizer for the encode */ if (cpi->sf.improved_quant) diff --git a/media/libvpx/vp8/encoder/encodeframe.h b/media/libvpx/vp8/encoder/encodeframe.h index 4dd6ba0de181..e185c1035c09 100644 --- a/media/libvpx/vp8/encoder/encodeframe.h +++ b/media/libvpx/vp8/encoder/encodeframe.h @@ -7,8 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef ENCODEFRAME_H -#define ENCODEFRAME_H +#ifndef VP8_ENCODER_ENCODEFRAME_H_ +#define VP8_ENCODER_ENCODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); extern void vp8_build_block_offsets(MACROBLOCK *x); @@ -24,4 +28,8 @@ extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEFRAME_H_ diff --git a/media/libvpx/vp8/encoder/encodeintra.h b/media/libvpx/vp8/encoder/encodeintra.h index be2141f2c633..a8d0284d29b5 100644 --- a/media/libvpx/vp8/encoder/encodeintra.h +++ b/media/libvpx/vp8/encoder/encodeintra.h @@ -9,13 +9,21 @@ */ -#ifndef _ENCODEINTRA_H_ -#define _ENCODEINTRA_H_ +#ifndef VP8_ENCODER_ENCODEINTRA_H_ +#define VP8_ENCODER_ENCODEINTRA_H_ #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred); void vp8_encode_intra16x16mby(MACROBLOCK *x); void vp8_encode_intra16x16mbuv(MACROBLOCK *x); void vp8_encode_intra4x4mby(MACROBLOCK *mb); void vp8_encode_intra4x4block(MACROBLOCK *x, int ib); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEINTRA_H_ diff --git a/media/libvpx/vp8/encoder/encodemb.h b/media/libvpx/vp8/encoder/encodemb.h index 6badf7d901aa..0b3ec875e7a4 100644 --- a/media/libvpx/vp8/encoder/encodemb.h +++ b/media/libvpx/vp8/encoder/encodemb.h @@ -9,10 +9,14 @@ */ -#ifndef __INC_ENCODEMB_H -#define __INC_ENCODEMB_H +#ifndef VP8_ENCODER_ENCODEMB_H_ +#define VP8_ENCODER_ENCODEMB_H_ #include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif void vp8_encode_inter16x16(MACROBLOCK *x); void vp8_build_dcblock(MACROBLOCK *b); @@ -23,4 +27,8 @@ void vp8_transform_intra_mby(MACROBLOCK *x); void vp8_optimize_mby(MACROBLOCK *x); void vp8_optimize_mbuv(MACROBLOCK *x); void vp8_encode_inter16x16y(MACROBLOCK *x); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEMB_H_ diff --git a/media/libvpx/vp8/encoder/encodemv.h b/media/libvpx/vp8/encoder/encodemv.h index a6116c133d01..722162ba212e 100644 --- a/media/libvpx/vp8/encoder/encodemv.h +++ b/media/libvpx/vp8/encoder/encodemv.h @@ -9,13 +9,21 @@ */ -#ifndef __INC_ENCODEMV_H -#define __INC_ENCODEMV_H +#ifndef VP8_ENCODER_ENCODEMV_H_ +#define VP8_ENCODER_ENCODEMV_H_ #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_write_mvprobs(VP8_COMP *); void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *); void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEMV_H_ diff --git a/media/libvpx/vp8/encoder/ethreading.c b/media/libvpx/vp8/encoder/ethreading.c index d4b17cef1a11..7b8b51f308bd 100644 --- a/media/libvpx/vp8/encoder/ethreading.c +++ b/media/libvpx/vp8/encoder/ethreading.c @@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data) } #endif + // Keep track of how many (consecutive) times a block + // is coded as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == + LAST_FRAME) { + // Increment, check for wrap-around. 
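Both the single-threaded path (encode_mb_row, earlier in this patch) and the threaded path here maintain consec_zero_last as an 8-bit saturating run-length counter; the increment itself continues below. The pattern in isolation (sketch):

    /* Saturating 8-bit run-length counter, as kept in cpi->consec_zero_last. */
    static void update_consec_zero_last(unsigned char *counter,
                                        int coded_as_zeromv_lastref)
    {
        if (coded_as_zeromv_lastref) {
            if (*counter < 255)      /* Saturate rather than wrap. */
                ++*counter;
        } else {
            *counter = 0;            /* Any other mode/reference resets the run. */
        }
    }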
+ if (cpi->consec_zero_last[map_index+mb_col] < 255) + cpi->consec_zero_last[map_index+mb_col] += + 1; + } else { + cpi->consec_zero_last[map_index+mb_col] = 0; + } + } /* Special case code for cyclic refresh * If cyclic update enabled then copy diff --git a/media/libvpx/vp8/encoder/firstpass.c b/media/libvpx/vp8/encoder/firstpass.c index 968c7f3652e1..98e5a7115dbb 100644 --- a/media/libvpx/vp8/encoder/firstpass.c +++ b/media/libvpx/vp8/encoder/firstpass.c @@ -940,9 +940,9 @@ static int64_t estimate_modemvcost(VP8_COMP *cpi, /* Crude estimate of overhead cost from modes * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb */ - mode_cost =((((av_pct_inter - av_pct_motion) * zz_cost) + - (av_pct_motion * motion_cost) + - (av_intra * intra_cost)) * cpi->common.MBs) * 512; + mode_cost = (int64_t)((((av_pct_inter - av_pct_motion) * zz_cost) + + (av_pct_motion * motion_cost) + + (av_intra * intra_cost)) * cpi->common.MBs) * 512; return mv_cost + mode_cost; } @@ -2310,7 +2310,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) pct_extra = (pct_extra > 20) ? 20 : pct_extra; cpi->twopass.alt_extra_bits = - (cpi->twopass.gf_group_bits * pct_extra) / 100; + (int)(cpi->twopass.gf_group_bits * pct_extra) / 100; cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits; cpi->twopass.alt_extra_bits /= ((cpi->baseline_gf_interval-1)>>1); @@ -2386,7 +2386,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) target_frame_size = max_bits; if (target_frame_size > cpi->twopass.gf_group_bits) - target_frame_size = cpi->twopass.gf_group_bits; + target_frame_size = (int)cpi->twopass.gf_group_bits; } /* Adjust error and bits remaining */ @@ -2444,10 +2444,10 @@ void vp8_second_pass(VP8_COMP *cpi) find_next_key_frame(cpi, &this_frame_copy); /* Special case: Error error_resilient_mode mode does not make much - * sense for two pass but with its current meaning but this code is + * sense for two pass but with its current meaning this code is * designed to stop outlandish behaviour if someone does set it when * using two pass. It effectively disables GF groups. This is - * temporary code till we decide what should really happen in this + * temporary code until we decide what should really happen in this * case. */ if (cpi->oxcf.error_resilient_mode) @@ -2773,7 +2773,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) kf_group_intra_err += this_frame->intra_error; kf_group_coded_err += this_frame->coded_error; - /* load a the next frame's stats */ + /* Load the next frame's stats. 
*/ vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame)); input_stats(cpi, this_frame); diff --git a/media/libvpx/vp8/encoder/firstpass.h b/media/libvpx/vp8/encoder/firstpass.h index 95e1e5463c6d..c409ebca8f1b 100644 --- a/media/libvpx/vp8/encoder/firstpass.h +++ b/media/libvpx/vp8/encoder/firstpass.h @@ -9,8 +9,12 @@ */ -#if !defined __INC_FIRSTPASS_H -#define __INC_FIRSTPASS_H +#ifndef VP8_ENCODER_FIRSTPASS_H_ +#define VP8_ENCODER_FIRSTPASS_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_init_first_pass(VP8_COMP *cpi); extern void vp8_first_pass(VP8_COMP *cpi); @@ -21,4 +25,8 @@ extern void vp8_second_pass(VP8_COMP *cpi); extern void vp8_end_second_pass(VP8_COMP *cpi); extern size_t vp8_firstpass_stats_sz(unsigned int mb_count); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_FIRSTPASS_H_ diff --git a/media/libvpx/vp8/encoder/lookahead.h b/media/libvpx/vp8/encoder/lookahead.h index cf56b75b7d5e..cad68e639fbf 100644 --- a/media/libvpx/vp8/encoder/lookahead.h +++ b/media/libvpx/vp8/encoder/lookahead.h @@ -7,11 +7,15 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef LOOKAHEAD_H -#define LOOKAHEAD_H +#ifndef VP8_ENCODER_LOOKAHEAD_H_ +#define VP8_ENCODER_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + struct lookahead_entry { YV12_BUFFER_CONFIG img; @@ -106,4 +110,8 @@ unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_LOOKAHEAD_H_ diff --git a/media/libvpx/vp8/encoder/mcomp.c b/media/libvpx/vp8/encoder/mcomp.c index 0b11ea64a65a..54abe76acd2a 100644 --- a/media/libvpx/vp8/encoder/mcomp.c +++ b/media/libvpx/vp8/encoder/mcomp.c @@ -17,6 +17,7 @@ #include #include #include "vp8/common/findnearmv.h" +#include "vp8/common/common.h" #ifdef VP8_ENTROPY_STATS static int mv_ref_ct [31] [4] [2]; diff --git a/media/libvpx/vp8/encoder/mcomp.h b/media/libvpx/vp8/encoder/mcomp.h index e36c51543cb0..f284f7c38075 100644 --- a/media/libvpx/vp8/encoder/mcomp.h +++ b/media/libvpx/vp8/encoder/mcomp.h @@ -9,12 +9,16 @@ */ -#ifndef __INC_MCOMP_H -#define __INC_MCOMP_H +#ifndef VP8_ENCODER_MCOMP_H_ +#define VP8_ENCODER_MCOMP_H_ #include "block.h" #include "vp8/common/variance.h" +#ifdef __cplusplus +extern "C" { +#endif + #ifdef VP8_ENTROPY_STATS extern void init_mv_ref_counts(); extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); @@ -104,4 +108,8 @@ typedef int (*vp8_diamond_search_fn_t) int_mv *center_mv ); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_MCOMP_H_ diff --git a/media/libvpx/vp8/encoder/modecosts.h b/media/libvpx/vp8/encoder/modecosts.h index 99ef119d5ddc..9281551c8d5f 100644 --- a/media/libvpx/vp8/encoder/modecosts.h +++ b/media/libvpx/vp8/encoder/modecosts.h @@ -9,9 +9,17 @@ */ -#ifndef __INC_MODECOSTS_H -#define __INC_MODECOSTS_H +#ifndef VP8_ENCODER_MODECOSTS_H_ +#define VP8_ENCODER_MODECOSTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif void vp8_init_mode_costs(VP8_COMP *x); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_MODECOSTS_H_ diff --git a/media/libvpx/vp8/encoder/mr_dissim.c b/media/libvpx/vp8/encoder/mr_dissim.c index 71218cca146f..8d96445f53ce 100644 --- a/media/libvpx/vp8/encoder/mr_dissim.c +++ b/media/libvpx/vp8/encoder/mr_dissim.c @@ -15,6 +15,7 @@ #include "mr_dissim.h" #include "vpx_mem/vpx_mem.h" #include "rdopt.h" +#include 
"vp8/common/common.h" void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { diff --git a/media/libvpx/vp8/encoder/mr_dissim.h b/media/libvpx/vp8/encoder/mr_dissim.h index f8cb135d52c3..5a59ce62a61a 100644 --- a/media/libvpx/vp8/encoder/mr_dissim.h +++ b/media/libvpx/vp8/encoder/mr_dissim.h @@ -9,12 +9,20 @@ */ -#ifndef __INC_MR_DISSIM_H -#define __INC_MR_DISSIM_H +#ifndef VP8_ENCODER_MR_DISSIM_H_ +#define VP8_ENCODER_MR_DISSIM_H_ #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); extern void vp8_cal_dissimilarity(VP8_COMP *cpi); extern void vp8_store_drop_frame_info(VP8_COMP *cpi); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_MR_DISSIM_H_ diff --git a/media/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/vp8/encoder/onyx_if.c index 4b60cfd3291c..74e75c4cfceb 100644 --- a/media/libvpx/vp8/encoder/onyx_if.c +++ b/media/libvpx/vp8/encoder/onyx_if.c @@ -19,7 +19,7 @@ #include "vp8/common/alloccommon.h" #include "mcomp.h" #include "firstpass.h" -#include "psnr.h" +#include "vpx/internal/vpx_psnr.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/extend.h" #include "ratectrl.h" @@ -98,6 +98,9 @@ extern double vp8_calc_ssimg #ifdef OUTPUT_YUV_SRC FILE *yuv_file; #endif +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file; +#endif #if 0 FILE *framepsnr; @@ -610,6 +613,26 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) while(block_count && i != cpi->cyclic_refresh_mode_index); cpi->cyclic_refresh_mode_index = i; + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive && + Q < (int)cpi->denoiser.denoise_pars.qp_thresh) { + // Under aggressive denoising, use segmentation to turn off loop + // filter below some qp thresh. The filter is turned off for all + // blocks that have been encoded as ZEROMV LAST x frames in a row, + // where x is set by cpi->denoiser.denoise_pars.consec_zerolast. + // This is to avoid "dot" artifacts that can occur from repeated + // loop filtering on noisy input source. + cpi->cyclic_refresh_q = Q; + lf_adjustment = -MAX_LOOP_FILTER; + for (i = 0; i < mbs_in_frame; ++i) { + seg_map[i] = (cpi->consec_zero_last[i] > + cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0; + } + } + } +#endif } /* Activate segmentation. */ @@ -1256,6 +1279,15 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) vpx_free(cpi->tplist); CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp8_denoiser_free(&cpi->denoiser); + vp8_denoiser_allocate(&cpi->denoiser, width, height, + cm->mb_rows, cm->mb_cols, + cpi->oxcf.noise_sensitivity); + } +#endif } @@ -1401,7 +1433,8 @@ static void update_layer_contexts (VP8_COMP *cpi) unsigned int i; double prev_layer_framerate=0; - for (i=0; inumber_of_layers; i++) + assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS); + for (i = 0; i < oxcf->number_of_layers && i < VPX_TS_MAX_LAYERS; ++i) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; @@ -1623,6 +1656,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) cpi->oxcf.maximum_buffer_size = rescale((int)cpi->oxcf.maximum_buffer_size, cpi->oxcf.target_bandwidth, 1000); + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. 
+ if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + cpi->buffer_level = cpi->bits_off_target; + } /* Set up frame rate and related parameters rate control values. */ vp8_new_framerate(cpi, cpi->framerate); @@ -1741,7 +1780,9 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { int width = (cpi->oxcf.Width + 15) & ~15; int height = (cpi->oxcf.Height + 15) & ~15; - vp8_denoiser_allocate(&cpi->denoiser, width, height); + vp8_denoiser_allocate(&cpi->denoiser, width, height, + cm->mb_rows, cm->mb_cols, + cpi->oxcf.noise_sensitivity); } } #endif @@ -1754,8 +1795,11 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) } +#ifndef M_LOG2_E #define M_LOG2_E 0.693147180559945309417 +#endif #define log2f(x) (log (x) / (float) M_LOG2_E) + static void cal_mvsadcosts(int *mvsadcost[2]) { int i = 1; @@ -1865,6 +1909,13 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) */ cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode; cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5; + if (cpi->oxcf.number_of_layers == 1) { + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 20; + } else if (cpi->oxcf.number_of_layers == 2) { + cpi->cyclic_refresh_mode_max_mbs_perframe = + (cpi->common.mb_rows * cpi->common.mb_cols) / 10; + } cpi->cyclic_refresh_mode_index = 0; cpi->cyclic_refresh_q = 32; @@ -1875,6 +1926,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) else cpi->cyclic_refresh_map = (signed char *) NULL; + CHECK_MEM_ERROR(cpi->consec_zero_last, + vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + #ifdef VP8_ENTROPY_STATS init_context_counters(); #endif @@ -1951,6 +2005,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) #ifdef OUTPUT_YUV_SRC yuv_file = fopen("bd.yuv", "ab"); #endif +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "ab"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2164,10 +2221,12 @@ void vp8_remove_compressor(VP8_COMP **ptr) 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->frames_in_layer[i] * lst_yv12->y_width * lst_yv12->y_height; - double total_psnr = vp8_mse2psnr(samples, 255.0, - cpi->total_error2[i]); - double total_psnr2 = vp8_mse2psnr(samples, 255.0, - cpi->total_error2_p[i]); + double total_psnr = + vpx_sse_to_psnr(samples, 255.0, + cpi->total_error2[i]); + double total_psnr2 = + vpx_sse_to_psnr(samples, 255.0, + cpi->total_error2_p[i]); double total_ssim = 100 * pow(cpi->sum_ssim[i] / cpi->sum_weights[i], 8.0); @@ -2184,9 +2243,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height; - double total_psnr = vp8_mse2psnr(samples, 255.0, - cpi->total_sq_error); - double total_psnr2 = vp8_mse2psnr(samples, 255.0, + double total_psnr = vpx_sse_to_psnr(samples, 255.0, + cpi->total_sq_error); + double total_psnr2 = vpx_sse_to_psnr(samples, 255.0, cpi->total_sq_error2); double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); @@ -2390,6 +2449,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) vpx_free(cpi->mb.ss); vpx_free(cpi->tok); vpx_free(cpi->cyclic_refresh_map); + vpx_free(cpi->consec_zero_last); vp8_remove_common(&cpi->common); vpx_free(cpi); @@ -2398,6 +2458,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) #ifdef OUTPUT_YUV_SRC fclose(yuv_file); #endif +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif #if 0 @@ -2516,8 
+2579,8 @@ static void generate_psnr_packet(VP8_COMP *cpi) pkt.data.psnr.samples[3] = width * height; for (i = 0; i < 4; i++) - pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0, - (double)(pkt.data.psnr.sse[i])); + pkt.data.psnr.psnr[i] = vpx_sse_to_psnr(pkt.data.psnr.samples[i], 255.0, + (double)(pkt.data.psnr.sse[i])); vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); } @@ -2598,10 +2661,9 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) } -#if OUTPUT_YUV_SRC -void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s) +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) +void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { - FILE *yuv_file = fopen(name, "ab"); unsigned char *src = s->y_buffer; int h = s->y_height; @@ -2631,12 +2693,9 @@ void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s) src += s->uv_stride; } while (--h); - - fclose(yuv_file); } #endif - static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; @@ -2675,8 +2734,8 @@ static int resize_key_frame(VP8_COMP *cpi) VP8_COMMON *cm = &cpi->common; /* Do we need to apply resampling for one pass cbr. - * In one pass this is more limited than in two pass cbr - * The test and any change is only made one per key frame sequence + * In one pass this is more limited than in two pass cbr. + * The test and any change is only made once per key frame sequence. */ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { @@ -2699,7 +2758,7 @@ static int resize_key_frame(VP8_COMP *cpi) cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; } - /* Get the new hieght and width */ + /* Get the new height and width */ Scale2Ratio(cm->horiz_scale, &hr, &hs); Scale2Ratio(cm->vert_scale, &vr, &vs); new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs; @@ -3093,10 +3152,8 @@ static void update_reference_frames(VP8_COMP *cpi) cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx; -#if CONFIG_MULTI_RES_ENCODING cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; -#endif } else /* For non key frames */ { @@ -3108,9 +3165,7 @@ static void update_reference_frames(VP8_COMP *cpi) cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; cm->alt_fb_idx = cm->new_fb_idx; -#if CONFIG_MULTI_RES_ENCODING cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; -#endif } else if (cm->copy_buffer_to_arf) { @@ -3124,10 +3179,8 @@ static void update_reference_frames(VP8_COMP *cpi) yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; cm->alt_fb_idx = cm->lst_fb_idx; -#if CONFIG_MULTI_RES_ENCODING cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[LAST_FRAME]; -#endif } } else /* if (cm->copy_buffer_to_arf == 2) */ @@ -3138,10 +3191,8 @@ static void update_reference_frames(VP8_COMP *cpi) yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; cm->alt_fb_idx = cm->gld_fb_idx; -#if CONFIG_MULTI_RES_ENCODING cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[GOLDEN_FRAME]; -#endif } } } @@ -3154,9 +3205,7 @@ static void update_reference_frames(VP8_COMP *cpi) cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; cm->gld_fb_idx = cm->new_fb_idx; -#if CONFIG_MULTI_RES_ENCODING cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; -#endif } else if (cm->copy_buffer_to_gf) { @@ -3170,10 +3219,8 @@ static void update_reference_frames(VP8_COMP *cpi) yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; 
                cm->gld_fb_idx = cm->lst_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                cpi->current_ref_frames[GOLDEN_FRAME] =
                    cpi->current_ref_frames[LAST_FRAME];
-#endif
            }
        }
        else /* if (cm->copy_buffer_to_gf == 2) */
@@ -3184,10 +3231,8 @@ static void update_reference_frames(VP8_COMP *cpi)
                yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
 
                cm->gld_fb_idx = cm->alt_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
                cpi->current_ref_frames[GOLDEN_FRAME] =
                    cpi->current_ref_frames[ALTREF_FRAME];
-#endif
            }
        }
    }
 
@@ -3199,9 +3244,7 @@ static void update_reference_frames(VP8_COMP *cpi)
        cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
 
        cm->lst_fb_idx = cm->new_fb_idx;
 
-#if CONFIG_MULTI_RES_ENCODING
        cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
-#endif
    }
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -3214,17 +3257,9 @@ static void update_reference_frames(VP8_COMP *cpi)
        if (cm->frame_type == KEY_FRAME)
        {
            int i;
-            vp8_yv12_copy_frame(
-                    cpi->Source,
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            vp8_yv12_extend_frame_borders(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
-                vp8_yv12_copy_frame(
-                        &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                        &cpi->denoiser.yv12_running_avg[i]);
+            for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
+                vp8_yv12_copy_frame(cpi->Source,
+                                    &cpi->denoiser.yv12_running_avg[i]);
        }
        else /* For non key frames */
        {
@@ -3250,12 +3285,123 @@ static void update_reference_frames(VP8_COMP *cpi)
                        &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
            }
        }
+        if (cpi->oxcf.noise_sensitivity == 4)
+            vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_last_source);
 
    }
 #endif
 
 }
 
+#if CONFIG_TEMPORAL_DENOISING
+static void process_denoiser_mode_change(VP8_COMP *cpi) {
+  const VP8_COMMON *const cm = &cpi->common;
+  int i, j;
+  int total = 0;
+  int num_blocks = 0;
+  // Number of blocks skipped along row/column in computing the
+  // nmse (normalized mean square error) of source.
+  int skip = 2;
+  // Only select blocks for computing nmse that have been encoded
+  // as ZERO LAST min_consec_zero_last frames in a row.
+  // Scale with number of temporal layers.
+  int min_consec_zero_last = 8 / cpi->oxcf.number_of_layers;
+  // Decision is tested for changing the denoising mode every
+  // num_mode_change times this function is called. Note that this
+  // function is called every 8 frames, so (8 * num_mode_change) is the
+  // number of frames over which a denoising mode switch is tested.
+  int num_mode_change = 15;
+  // Framerate factor, to compensate for larger mse at lower framerates.
+  // Use ref_framerate, which is full source framerate for temporal layers.
+  // TODO(marpan): Adjust this factor.
+  int fac_framerate = cpi->ref_framerate < 25.0f ? 80 : 100;
+  int tot_num_blocks = cm->mb_rows * cm->mb_cols;
+  int ystride = cpi->Source->y_stride;
+  unsigned char *src = cpi->Source->y_buffer;
+  unsigned char *dst = cpi->denoiser.yv12_last_source.y_buffer;
+  static const unsigned char const_source[16] = {
+      128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+      128, 128, 128};
+
+  // Loop through the Y plane, every skip blocks along rows and columns,
+  // summing the normalized mean square error, only for blocks that have
+  // been encoded as ZEROMV LAST at least min_consec_zero_last frames in
+  // a row and have small sum difference between current and previous frame.
+  // Normalization here is by the contrast of the current frame block.
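One detail of the loop that follows: the "small sum difference" screen is implemented as (mse - var) < 256. Assuming the standard vp8 definitions, vp8_mse16x16 returns the raw SSE while vp8_variance16x16 returns SSE - sum_diff^2/256 for a 16x16 block, so the screen reduces to the check below:

    #include <stdlib.h>

    /* Equivalent form of the (mse - var) < 256 screen: for a 16x16 block,
     * mse - var == (sum_diff * sum_diff) >> 8, so the test bounds the mean
     * per-pixel change between frames below 1. */
    static int small_mean_change(int sum_diff)
    {
        return abs(sum_diff) < 256;
    }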
+ for (i = 0; i < cm->Height; i += 16 * skip) { + int block_index_row = (i >> 4) * cm->mb_cols; + for (j = 0; j < cm->Width; j += 16 * skip) { + int index = block_index_row + (j >> 4); + if (cpi->consec_zero_last[index] >= min_consec_zero_last) { + unsigned int sse; + const unsigned int mse = vp8_mse16x16(src + j, + ystride, + dst + j, + ystride, + &sse); + const unsigned int var = vp8_variance16x16(src + j, + ystride, + dst + j, + ystride, + &sse); + // Only consider this block as valid for noise measurement + // if the sum_diff average of the current and previous frame + // is small (to avoid effects from lighting change). + if ((mse - var) < 256) { + const unsigned int act = vp8_variance16x16(src + j, + ystride, + const_source, + 0, + &sse); + if (act > 0) + total += mse / act; + num_blocks++; + } + } + } + src += 16 * skip * ystride; + dst += 16 * skip * ystride; + } + total = total * fac_framerate / 100; + + // Only consider this frame as valid sample if we have computed nmse over + // at least ~1/16 blocks, and Total > 0 (Total == 0 can happen if the + // application inputs duplicate frames, or contrast is all zero). + if (total > 0 && + (num_blocks > (tot_num_blocks >> 4))) { + // Update the recursive mean square source_diff. + if (cpi->denoiser.nmse_source_diff_count == 0) + // First sample in new interval. + cpi->denoiser.nmse_source_diff = total; + else + // For subsequent samples, use average with weight ~1/4 for new sample. + cpi->denoiser.nmse_source_diff = (int)((total >> 2) + + 3 * (cpi->denoiser.nmse_source_diff >> 2)); + cpi->denoiser.nmse_source_diff_count++; + } + // Check for changing the denoiser mode, when we have obtained #samples = + // num_mode_change. + if (cpi->denoiser.nmse_source_diff_count == num_mode_change) { + // Check for going up: from normal to aggressive mode. + if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUV) && + (cpi->denoiser.nmse_source_diff > + cpi->denoiser.threshold_aggressive_mode)) { + vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive); + } else { + // Check for going down: from aggressive to normal mode. + if ((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) && + (cpi->denoiser.nmse_source_diff < + cpi->denoiser.threshold_aggressive_mode)) { + vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV); + } + } + // Reset metric and counter for next interval. + cpi->denoiser.nmse_source_diff = 0; + cpi->denoiser.nmse_source_diff_count = 0; + } +} +#endif + void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { const FRAME_TYPE frame_type = cm->frame_type; @@ -3412,6 +3558,12 @@ static void encode_frame_to_data_rate { /* Key frame from VFW/auto-keyframe/first frame */ cm->frame_type = KEY_FRAME; +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity == 4) { + // For adaptive mode, reset denoiser to normal mode on key frame. + vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV); + } +#endif } #if CONFIG_MULTI_RES_ENCODING @@ -3445,6 +3597,31 @@ static void encode_frame_to_data_rate } #endif + // Find the reference frame closest to the current frame. 
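The recursive average kept in nmse_source_diff above is a fixed-point exponential moving average with weight roughly 1/4 on each new sample. Equivalent arithmetic (sketch; the closest-reference search announced above follows below):

    /* EWMA update for nmse_source_diff: roughly (sample + 3 * prev) / 4. */
    static int update_nmse_source_diff(int prev, int sample, int is_first_sample)
    {
        if (is_first_sample)
            return sample;                      /* Seed a new interval. */
        return (sample >> 2) + 3 * (prev >> 2);
    }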
+ cpi->closest_reference_frame = LAST_FRAME; + if (cm->frame_type != KEY_FRAME) { + int i; + MV_REFERENCE_FRAME closest_ref = INTRA_FRAME; + if (cpi->ref_frame_flags & VP8_LAST_FRAME) { + closest_ref = LAST_FRAME; + } else if (cpi->ref_frame_flags & VP8_GOLD_FRAME) { + closest_ref = GOLDEN_FRAME; + } else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) { + closest_ref = ALTREF_FRAME; + } + for (i = 1; i <= 3; i++) { + vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t) + ((i == 3) ? 4 : i); + if (cpi->ref_frame_flags & ref_frame_type) { + if ((cm->current_video_frame - cpi->current_ref_frames[i]) < + (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) { + closest_ref = i; + } + } + } + cpi->closest_reference_frame = closest_ref; + } + /* Set various flags etc to special state if it is a key frame */ if (cm->frame_type == KEY_FRAME) { @@ -3461,6 +3638,9 @@ static void encode_frame_to_data_rate { cpi->mb.rd_thresh_mult[i] = 128; } + + // Reset the zero_last counter to 0 on key frame. + vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); } #if 0 @@ -3808,7 +3988,7 @@ static void encode_frame_to_data_rate /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. - */ + */ if (cpi->cyclic_refresh_mode_enabled) { if (cpi->current_layer==0) @@ -3882,8 +4062,9 @@ static void encode_frame_to_data_rate #endif + #ifdef OUTPUT_YUV_SRC - vp8_write_yuv_frame(cpi->Source); + vp8_write_yuv_frame(yuv_file, cpi->Source); #endif do @@ -3977,6 +4158,8 @@ static void encode_frame_to_data_rate else disable_segmentation(cpi); } + // Reset the consec_zero_last counter on key frame. + vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); vp8_set_quantizer(cpi, Q); } @@ -4365,7 +4548,8 @@ static void encode_frame_to_data_rate { for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++) { - if(tmp->mbmi.mode == ZEROMV) + if (tmp->mbmi.mode == ZEROMV && + tmp->mbmi.ref_frame == LAST_FRAME) cpi->zeromv_count++; tmp++; } @@ -4407,6 +4591,21 @@ static void encode_frame_to_data_rate cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx]; +#if CONFIG_TEMPORAL_DENOISING + // For the adaptive denoising mode (noise_sensitivity == 4), sample the mse + // of source diff (between current and previous frame), and determine if we + // should switch the denoiser mode. Sampling refers to computing the mse for + // a sub-sample of the frame (i.e., skip x blocks along row/column), and + // only for blocks in that set that have used ZEROMV LAST, along with some + // constraint on the sum diff between blocks. This process is called every + // ~8 frames, to further reduce complexity. 
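The search above keeps whichever enabled reference was encoded most recently, comparing ages against cm->current_video_frame. Restated (illustrative helper; the array is indexed by MV_REFERENCE_FRAME, with LAST=1, GOLDEN=2, ALTREF=3):

    /* Illustrative: pick the enabled reference with the smallest age. */
    static int find_closest_ref(unsigned int now,
                                const unsigned int ref_frame_no[4],
                                const int enabled[4])
    {
        int best = 1, i;                       /* Default to LAST_FRAME. */
        for (i = 1; i <= 3; ++i)
            if (enabled[i] && !enabled[best])
                best = i;                      /* First enabled reference. */
        for (i = 1; i <= 3; ++i)
            if (enabled[i] && now - ref_frame_no[i] < now - ref_frame_no[best])
                best = i;
        return best;
    }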
+ if (cpi->oxcf.noise_sensitivity == 4 && + cpi->frames_since_key % 8 == 0 && + cm->frame_type != KEY_FRAME) { + process_denoiser_mode_change(cpi); + } +#endif + #if CONFIG_MULTITHREAD if (cpi->b_multi_threaded) { @@ -4422,6 +4621,11 @@ static void encode_frame_to_data_rate update_reference_frames(cpi); +#ifdef OUTPUT_YUV_DENOISED + vp8_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); +#endif + #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) if (cpi->oxcf.error_resilient_mode) { @@ -4621,45 +4825,43 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" - "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" - "%10.3f %8d\n", + fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 + "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d " + "%8.2lf %"PRId64" %10.3lf %10"PRId64" %8d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, + cpi->total_target_vs_actual, cpi->buffer_level, (cpi->oxcf.starting_buffer_level-cpi->bits_off_target), - (int)cpi->total_actual_bits, cm->base_qindex, + cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->ni_av_qi, cpi->cq_target_quality, - cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, - (int)cpi->twopass.bits_left, + cpi->twopass.bits_left, cpi->twopass.total_left_stats.coded_error, (double)cpi->twopass.bits_left / cpi->twopass.total_left_stats.coded_error, cpi->tot_recode_hits); else - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" - "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" - "%8d\n", - cpi->common.current_video_frame, - cpi->this_frame_target, cpi->projected_frame_size, + fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 + "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d " + "%8.2lf %"PRId64" %10.3lf %8d\n", + cpi->common.current_video_frame, cpi->this_frame_target, + cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, + cpi->total_target_vs_actual, cpi->buffer_level, (cpi->oxcf.starting_buffer_level-cpi->bits_off_target), - (int)cpi->total_actual_bits, cm->base_qindex, + cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->ni_av_qi, cpi->cq_target_quality, - cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, - (int)cpi->twopass.bits_left, + cpi->twopass.bits_left, cpi->twopass.total_left_stats.coded_error, cpi->tot_recode_hits); @@ -4667,7 +4869,6 @@ static void encode_frame_to_data_rate { FILE *fmodes = fopen("Modes.stt", "a"); - int i; fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, @@ -4811,33 +5012,11 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, } #endif -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. 
 */
-#if HAVE_NEON
-extern void vp8_push_neon(int64_t *store);
-extern void vp8_pop_neon(int64_t *store);
-#endif
-
-
 int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time)
 {
-#if HAVE_NEON
-    int64_t store_reg[8];
-#if CONFIG_RUNTIME_CPU_DETECT
-    VP8_COMMON *cm = &cpi->common;
-#endif
-#endif
    struct vpx_usec_timer timer;
    int res = 0;
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_push_neon(store_reg);
-    }
-#endif
-
    vpx_usec_timer_start(&timer);
 
    /* Reinit the lookahead buffer if the frame size changes */
@@ -4854,15 +5033,6 @@ int vp8_receive_raw_frame(VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_C
    vpx_usec_timer_mark(&timer);
    cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_pop_neon(store_reg);
-    }
-#endif
-
    return res;
 }
 
@@ -4883,9 +5053,6 @@ static int frame_is_reference(const VP8_COMP *cpi)
 
 int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush)
 {
-#if HAVE_NEON
-    int64_t store_reg[8];
-#endif
    VP8_COMMON *cm;
    struct vpx_usec_timer tsctimer;
    struct vpx_usec_timer ticktimer;
@@ -4900,20 +5067,12 @@
    if (setjmp(cpi->common.error.jmp))
    {
        cpi->common.error.setjmp = 0;
+        vp8_clear_system_state();
        return VPX_CODEC_CORRUPT_FRAME;
    }
 
    cpi->common.error.setjmp = 1;
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_push_neon(store_reg);
-    }
-#endif
-
    vpx_usec_timer_start(&cmptimer);
 
    cpi->source = NULL;
@@ -4996,14 +5155,6 @@
 #endif
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->cpu_caps & HAS_NEON)
-#endif
-        {
-            vp8_pop_neon(store_reg);
-        }
-#endif
        return -1;
    }
 
@@ -5066,7 +5217,9 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
            unsigned int i;
 
            /* Update frame rates for each layer */
-            for (i=0; i<cpi->oxcf.number_of_layers; i++)
+            assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS);
+            for (i = 0; i < cpi->oxcf.number_of_layers &&
+                        i < VPX_TS_MAX_LAYERS; ++i)
            {
                LAYER_CONTEXT *lc = &cpi->layer_context[i];
                lc->framerate = cpi->ref_framerate /
@@ -5268,7 +5421,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
                int y_samples = orig->y_height * orig->y_width ;
                int uv_samples = orig->uv_height * orig->uv_width ;
                int t_samples = y_samples + 2 * uv_samples;
-                double sq_error, sq_error2;
+                double sq_error;
 
                ye = calc_plane_error(orig->y_buffer, orig->y_stride,
                  recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
@@ -5281,16 +5434,17 @@
 
                sq_error = (double)(ye + ue + ve);
 
-                frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+                frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error);
 
-                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
-                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
-                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
+                cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye);
+                cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue);
+                cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve);
                cpi->total_sq_error += sq_error;
                cpi->total += frame_psnr;
 #if CONFIG_POSTPROC
                {
                    YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+                    double sq_error2;
                    double frame_psnr2, frame_ssim2 = 0;
                    double weight = 0;
@@ -5308,14 +5462,14 @@
 
                    sq_error2 = (double)(ye + ue + ve);
 
-                    frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
+                    frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2);
 
-                    cpi->totalp_y += vp8_mse2psnr(y_samples,
-                                                  255.0, (double)ye);
-                    cpi->totalp_u += vp8_mse2psnr(uv_samples,
-                                                  255.0, (double)ue);
-                    cpi->totalp_v += vp8_mse2psnr(uv_samples,
-                                                  255.0, (double)ve);
+                    cpi->totalp_y += vpx_sse_to_psnr(y_samples,
+                                                     255.0, (double)ye);
+                    cpi->totalp_u += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ue);
+                    cpi->totalp_v += vpx_sse_to_psnr(uv_samples,
+                                                     255.0, (double)ve);
                    cpi->total_sq_error2 += sq_error2;
                    cpi->totalp += frame_psnr2;
 
@@ -5406,15 +5560,6 @@
 #endif
 #endif
 
-#if HAVE_NEON
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (cm->cpu_caps & HAS_NEON)
-#endif
-    {
-        vp8_pop_neon(store_reg);
-    }
-#endif
-
    cpi->common.error.setjmp = 0;
 
    return 0;
diff --git a/media/libvpx/vp8/encoder/onyx_int.h b/media/libvpx/vp8/encoder/onyx_int.h
index 3ab0fe8bfa1c..f0424e69ca5e 100644
--- a/media/libvpx/vp8/encoder/onyx_int.h
+++ b/media/libvpx/vp8/encoder/onyx_int.h
@@ -9,8 +9,8 @@
  */
 
-#ifndef __INC_VP8_INT_H
-#define __INC_VP8_INT_H
+#ifndef VP8_ENCODER_ONYX_INT_H_
+#define VP8_ENCODER_ONYX_INT_H_
 
 #include <stdio.h>
 #include "vpx_config.h"
@@ -33,6 +33,10 @@
 #include "vp8/encoder/denoising.h"
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MIN_GF_INTERVAL 4
 #define DEFAULT_GF_INTERVAL 7
 
@@ -57,9 +61,6 @@
 #define VP8_TEMPORAL_ALT_REF 1
 #endif
 
-#define MAX(x,y) (((x)>(y))?(x):(y))
-#define MIN(x,y) (((x)<(y))?(x):(y))
-
 typedef struct
 {
     int kf_indicated;
@@ -510,6 +511,8 @@ typedef struct VP8_COMP
     int cyclic_refresh_mode_index;
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
+    // Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
+    unsigned char *consec_zero_last;
 
     // Frame counter for the temporal pattern. Counter is reset when the temporal
    // layers are changed dynamically (run-time change).
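vpx_sse_to_psnr, substituted throughout this function, computes the same quantity as the vp8_mse2psnr implementation deleted further down in this patch: PSNR from a total squared error and sample count, capped at 100 dB. For reference:

    #include <math.h>

    /* Same formula as the removed vp8_mse2psnr: samples is the pixel count
     * behind sse, peak is 255.0 for 8-bit video. */
    static double sse_to_psnr(double samples, double peak, double sse)
    {
        if (sse > 0.0) {
            const double psnr = 10.0 * log10(peak * peak * samples / sse);
            return psnr > 100.0 ? 100.0 : psnr;
        }
        return 100.0;    /* Cap to avoid dividing by zero on a perfect match. */
    }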
@@ -681,9 +684,10 @@ typedef struct VP8_COMP int mr_low_res_mb_cols; /* Indicate if lower-res mv info is available */ unsigned char mr_low_res_mv_avail; +#endif /* The frame number of each reference frames */ unsigned int current_ref_frames[MAX_REF_FRAMES]; -#endif + MV_REFERENCE_FRAME closest_reference_frame; struct rd_costs_struct { @@ -721,4 +725,8 @@ void vp8_set_speed_features(VP8_COMP *cpi); "Failed to allocate "#lval);\ } while(0) #endif +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ONYX_INT_H_ diff --git a/media/libvpx/vp8/encoder/pickinter.c b/media/libvpx/vp8/encoder/pickinter.c index c5279fed2a88..43f8957d1417 100644 --- a/media/libvpx/vp8/encoder/pickinter.c +++ b/media/libvpx/vp8/encoder/pickinter.c @@ -14,6 +14,7 @@ #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" +#include "vp8/common/common.h" #include "vp8/common/entropymode.h" #include "pickinter.h" #include "vp8/common/findnearmv.h" @@ -39,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); - int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -487,6 +487,7 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2, MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode; int_mv mv = x->e_mbd.mode_info_context->mbmi.mv; int this_rd; + int denoise_aggressive = 0; /* Exit early and don't compute the distortion if this macroblock * is marked inactive. */ if (cpi->active_map_enabled && x->active_ptr[0] == 0) @@ -505,16 +506,18 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, *distortion2); - /* Adjust rd to bias to ZEROMV */ - if(this_mode == ZEROMV) - { - /* Bias to ZEROMV on LAST_FRAME reference when it is available. */ - if ((cpi->ref_frame_flags & VP8_LAST_FRAME & - cpi->common.refresh_last_frame) - && x->e_mbd.mode_info_context->mbmi.ref_frame != LAST_FRAME) - rd_adj = 100; +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + denoise_aggressive = + (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) ? 1 : 0; + } +#endif - // rd_adj <= 100 + // Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame. 
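The next hunk replaces INT_MAX with UINT_MAX as the initial value of these SSE trackers (see below). The change is a sentinel fix: the variables are unsigned, and an unsigned variable seeded with INT_MAX can still be "beaten" by a legitimate SSE in the range (INT_MAX, UINT_MAX]. Minimal illustration:

    #include <limits.h>

    unsigned int best_sse = UINT_MAX;   /* Correct: no SSE can exceed it.   */
    /* unsigned int best_sse = INT_MAX;    Bug: 0x7fffffff, yet a very
                                           noisy block's SSE may be larger. */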
+ if (this_mode == ZEROMV && + x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME && + (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME)) + { this_rd = ((int64_t)this_rd) * rd_adj / 100; } @@ -589,9 +592,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int distortion2; int bestsme = INT_MAX; int best_mode_index = 0; - unsigned int sse = INT_MAX, best_rd_sse = INT_MAX; + unsigned int sse = UINT_MAX, best_rd_sse = UINT_MAX; #if CONFIG_TEMPORAL_DENOISING - unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX; + unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX; #endif int sf_improved_mv_pred = cpi->sf.improved_mv_pred; @@ -693,6 +696,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, */ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + rd_adjustment = (int)(rd_adjustment * + cpi->denoiser.denoise_pars.pickmode_mv_bias / 100); + } +#endif + /* if we encode a new mv this is important * find the best new motion vector */ @@ -1167,6 +1177,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; if (x->best_sse_inter_mode == DC_PRED) { /* No best MV found. */ @@ -1176,9 +1187,11 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, x->best_reference_frame = best_mbmode.ref_frame; best_sse = best_rd_sse; } + x->increase_denoising = 0; vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse, - recon_yoffset, recon_uvoffset); - + recon_yoffset, recon_uvoffset, + &cpi->common.lf_info, mb_row, mb_col, + block_index); /* Reevaluate ZEROMV after denoising. */ if (best_mbmode.ref_frame == INTRA_FRAME && diff --git a/media/libvpx/vp8/encoder/pickinter.h b/media/libvpx/vp8/encoder/pickinter.h index 35011cab3588..cf3b1f8d49cd 100644 --- a/media/libvpx/vp8/encoder/pickinter.h +++ b/media/libvpx/vp8/encoder/pickinter.h @@ -9,11 +9,15 @@ */ -#ifndef __INC_PICKINTER_H -#define __INC_PICKINTER_H +#ifndef VP8_ENCODER_PICKINTER_H_ +#define VP8_ENCODER_PICKINTER_H_ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra, @@ -24,4 +28,8 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb, const vp8_variance_fn_ptr_t *vfp, unsigned int *sse, int_mv this_mv); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_PICKINTER_H_ diff --git a/media/libvpx/vp8/encoder/picklpf.c b/media/libvpx/vp8/encoder/picklpf.c index 250d04c7f906..f0c8f28fc968 100644 --- a/media/libvpx/vp8/encoder/picklpf.c +++ b/media/libvpx/vp8/encoder/picklpf.c @@ -23,8 +23,8 @@ extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -void vp8_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) +static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { unsigned char *src_y, *dst_y; int yheight; @@ -173,7 +173,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) /* Get the err using the previous frame's filter value. 
*/ /* Copy the unfiltered / processed recon buffer to the new buffer */ - vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); best_err = calc_partial_ssl_err(sd, cm->frame_to_show); @@ -184,7 +184,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) while (filt_val >= min_filter_level) { /* Apply the loop filter */ - vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); /* Get the err for filtered frame */ @@ -214,7 +214,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) while (filt_val < max_filter_level) { /* Apply the loop filter */ - vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show); + yv12_copy_partial_frame(saved_frame, cm->frame_to_show); vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); diff --git a/media/libvpx/vp8/encoder/psnr.c b/media/libvpx/vp8/encoder/psnr.c deleted file mode 100644 index b3a3d955290f..000000000000 --- a/media/libvpx/vp8/encoder/psnr.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_scale/yv12config.h" -#include "math.h" -#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */ - -#define MAX_PSNR 100 - -double vp8_mse2psnr(double Samples, double Peak, double Mse) -{ - double psnr; - - if ((double)Mse > 0.0) - psnr = 10.0 * log10(Peak * Peak * Samples / Mse); - else - psnr = MAX_PSNR; /* Limit to prevent / 0 */ - - if (psnr > MAX_PSNR) - psnr = MAX_PSNR; - - return psnr; -} diff --git a/media/libvpx/vp8/encoder/psnr.h b/media/libvpx/vp8/encoder/psnr.h deleted file mode 100644 index 7f6269abe4c8..000000000000 --- a/media/libvpx/vp8/encoder/psnr.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef __INC_PSNR_H -#define __INC_PSNR_H - -extern double vp8_mse2psnr(double Samples, double Peak, double Mse); - -#endif diff --git a/media/libvpx/vp8/encoder/quantize.h b/media/libvpx/vp8/encoder/quantize.h index d55496c5ffe5..c739b2627b61 100644 --- a/media/libvpx/vp8/encoder/quantize.h +++ b/media/libvpx/vp8/encoder/quantize.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_QUANTIZE_H -#define __INC_QUANTIZE_H +#ifndef VP8_ENCODER_QUANTIZE_H_ +#define VP8_ENCODER_QUANTIZE_H_ + +#ifdef __cplusplus +extern "C" { +#endif struct VP8_COMP; struct macroblock; @@ -20,4 +24,8 @@ extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x); extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, int ok_to_skip); extern void vp8cx_init_quantizer(struct VP8_COMP *cpi); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_QUANTIZE_H_ diff --git a/media/libvpx/vp8/encoder/ratectrl.c b/media/libvpx/vp8/encoder/ratectrl.c index fe4db13b3983..c51650c3c262 100644 --- a/media/libvpx/vp8/encoder/ratectrl.c +++ b/media/libvpx/vp8/encoder/ratectrl.c @@ -174,14 +174,6 @@ static const int kf_gf_boost_qlimits[QINDEX_RANGE] = 600, 600, 600, 600, 600, 600, 600, 600, }; -/* % adjustment to target kf size based on seperation from previous frame */ -static const int kf_boost_seperation_adjustment[16] = -{ - 30, 40, 50, 55, 60, 65, 70, 75, - 80, 85, 90, 95, 100, 100, 100, 100, -}; - - static const int gf_adjust_table[101] = { 100, @@ -1238,7 +1230,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { Q = cpi->oxcf.gold_q; } - } else { diff --git a/media/libvpx/vp8/encoder/ratectrl.h b/media/libvpx/vp8/encoder/ratectrl.h index c43f08d6dd26..829697f391f6 100644 --- a/media/libvpx/vp8/encoder/ratectrl.h +++ b/media/libvpx/vp8/encoder/ratectrl.h @@ -9,10 +9,15 @@ */ -#if !defined __INC_RATECTRL_H +#ifndef VP8_ENCODER_RATECTRL_H_ +#define VP8_ENCODER_RATECTRL_H_ #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_save_coding_context(VP8_COMP *cpi); extern void vp8_restore_coding_context(VP8_COMP *cpi); @@ -25,4 +30,8 @@ extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_ /* return of 0 means drop frame */ extern int vp8_pick_frame_size(VP8_COMP *cpi); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_RATECTRL_H_ diff --git a/media/libvpx/vp8/encoder/rdopt.c b/media/libvpx/vp8/encoder/rdopt.c index 5016cc4220f2..2f6f5d07c81d 100644 --- a/media/libvpx/vp8/encoder/rdopt.c +++ b/media/libvpx/vp8/encoder/rdopt.c @@ -15,7 +15,6 @@ #include #include "vpx_config.h" #include "vp8_rtcd.h" -#include "vp8/common/pragmas.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" @@ -528,19 +527,16 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); -# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] ) - + assert(eob <= 16); for (; c < eob; c++) { - int v = QC(c); - int t = vp8_dct_value_tokens_ptr[v].Token; + const int v = qcoeff_ptr[vp8_default_zig_zag1d[c]]; + const int t = vp8_dct_value_tokens_ptr[v].Token; cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t]; cost += vp8_dct_value_cost_ptr[v]; pt = vp8_prev_token_class[t]; } -# undef QC - if (c < 16) cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN]; @@ -1939,7 +1935,8 @@ static void update_best_mode(BEST_MODE* best_mode, int this_rd, void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int 
recon_uvoffset, int *returnrate, - int *returndistortion, int *returnintra) + int *returndistortion, int *returnintra, + int mb_row, int mb_col) { BLOCK *b = &x->block[0]; BLOCKD *d = &x->e_mbd.block[0]; @@ -1977,8 +1974,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, cpi->common.y1dc_delta_q); #if CONFIG_TEMPORAL_DENOISING - unsigned int zero_mv_sse = INT_MAX, best_sse = INT_MAX, - best_rd_sse = INT_MAX; + unsigned int zero_mv_sse = UINT_MAX, best_sse = UINT_MAX, + best_rd_sse = UINT_MAX; #endif mode_mv = mode_mv_sb[sign_bias]; @@ -2514,6 +2511,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; if (x->best_sse_inter_mode == DC_PRED) { /* No best MV found. */ @@ -2524,8 +2522,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, best_sse = best_rd_sse; } vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse, - recon_yoffset, recon_uvoffset); - + recon_yoffset, recon_uvoffset, + &cpi->common.lf_info, mb_row, mb_col, + block_index); /* Reevaluate ZEROMV after denoising. */ if (best_mode.mbmode.ref_frame == INTRA_FRAME && diff --git a/media/libvpx/vp8/encoder/rdopt.h b/media/libvpx/vp8/encoder/rdopt.h index 1e11fa77dfce..e0da35e203c8 100644 --- a/media/libvpx/vp8/encoder/rdopt.h +++ b/media/libvpx/vp8/encoder/rdopt.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_RDOPT_H -#define __INC_RDOPT_H +#ifndef VP8_ENCODER_RDOPT_H_ +#define VP8_ENCODER_RDOPT_H_ + +#ifdef __cplusplus +extern "C" { +#endif #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) @@ -66,7 +70,10 @@ static void insertsortsad(int arr[],int idx[], int len) } extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); -extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); +extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, + int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion, + int *returnintra, int mb_row, int mb_col); extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); @@ -130,4 +137,8 @@ extern void vp8_mv_pred ); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_RDOPT_H_ diff --git a/media/libvpx/vp8/encoder/segmentation.h b/media/libvpx/vp8/encoder/segmentation.h index 12815b087f40..6b5500594e6e 100644 --- a/media/libvpx/vp8/encoder/segmentation.h +++ b/media/libvpx/vp8/encoder/segmentation.h @@ -8,9 +8,21 @@ * be found in the AUTHORS file in the root of the source tree. 
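/* [Editor's note -- illustrative sketches, not part of the patch; they
 * refer to the hunks above and below.]
 * 1) Both pick-mode paths (vp8_pick_inter_mode and vp8_rd_pick_inter_mode)
 *    now hand the denoiser a per-macroblock index, computed row-major over
 *    the frame's macroblock grid exactly as in the hunks above: */
static int mb_block_index(int mb_row, int mb_col, int mb_cols)
{
    return mb_row * mb_cols + mb_col;
}

/* 2) The temporal_filter.c hunk below hoists the rounding term so that
 *    strength == 0 no longer evaluates 1 << (strength - 1), a shift by -1
 *    (undefined behavior in C). Scalar sketch of the per-pixel modifier: */
static int filter_modifier(int diff, int strength, int filter_weight)
{
    const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
    int modifier = diff * diff * 3;  /* squared error, scaled as in the hunk */
    modifier += rounding;
    modifier >>= strength;
    if (modifier > 16)
        modifier = 16;               /* same cap as the surrounding code */
    return (16 - modifier) * filter_weight;
}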
*/ +#ifndef VP8_ENCODER_SEGMENTATION_H_ +#define VP8_ENCODER_SEGMENTATION_H_ #include "string.h" #include "vp8/common/blockd.h" #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_SEGMENTATION_H_ diff --git a/media/libvpx/vp8/encoder/temporal_filter.c b/media/libvpx/vp8/encoder/temporal_filter.c index 7e3af71ecba8..4dc0d9592211 100644 --- a/media/libvpx/vp8/encoder/temporal_filter.c +++ b/media/libvpx/vp8/encoder/temporal_filter.c @@ -16,7 +16,6 @@ #include "vp8/common/alloccommon.h" #include "mcomp.h" #include "firstpass.h" -#include "psnr.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/extend.h" #include "ratectrl.h" @@ -99,6 +98,7 @@ void vp8_temporal_filter_apply_c unsigned int i, j, k; int modifier; int byte = 0; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; for (i = 0,k = 0; i < block_size; i++) { @@ -115,7 +115,7 @@ void vp8_temporal_filter_apply_c */ modifier *= modifier; modifier *= 3; - modifier += 1 << (strength - 1); + modifier += rounding; modifier >>= strength; if (modifier > 16) diff --git a/media/libvpx/vp8/encoder/tokenize.c b/media/libvpx/vp8/encoder/tokenize.c index 11559a72083c..2dc8205278b5 100644 --- a/media/libvpx/vp8/encoder/tokenize.c +++ b/media/libvpx/vp8/encoder/tokenize.c @@ -213,6 +213,7 @@ static void tokenize1st_order_b /* Luma */ for (block = 0; block < 16; block++, b++) { + const int eob = *b->eob; tmp1 = vp8_block2above[block]; tmp2 = vp8_block2left[block]; qcoeff_ptr = b->qcoeff; @@ -223,7 +224,7 @@ static void tokenize1st_order_b c = type ? 0 : 1; - if(c >= *b->eob) + if(c >= eob) { /* c = band for this case */ t->Token = DCT_EOB_TOKEN; @@ -250,7 +251,8 @@ static void tokenize1st_order_b t++; c++; - for (; c < *b->eob; c++) + assert(eob <= 16); + for (; c < eob; c++) { rc = vp8_default_zig_zag1d[c]; band = vp8_coef_bands[c]; @@ -286,6 +288,7 @@ static void tokenize1st_order_b /* Chroma */ for (block = 16; block < 24; block++, b++) { + const int eob = *b->eob; tmp1 = vp8_block2above[block]; tmp2 = vp8_block2left[block]; qcoeff_ptr = b->qcoeff; @@ -294,7 +297,7 @@ static void tokenize1st_order_b VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); - if(!(*b->eob)) + if(!eob) { /* c = band for this case */ t->Token = DCT_EOB_TOKEN; @@ -321,7 +324,8 @@ static void tokenize1st_order_b t++; c = 1; - for (; c < *b->eob; c++) + assert(eob <= 16); + for (; c < eob; c++) { rc = vp8_default_zig_zag1d[c]; band = vp8_coef_bands[c]; diff --git a/media/libvpx/vp8/encoder/tokenize.h b/media/libvpx/vp8/encoder/tokenize.h index 1e6cea114657..b73a9ee1c896 100644 --- a/media/libvpx/vp8/encoder/tokenize.h +++ b/media/libvpx/vp8/encoder/tokenize.h @@ -9,12 +9,16 @@ */ -#ifndef tokenize_h -#define tokenize_h +#ifndef VP8_ENCODER_TOKENIZE_H_ +#define VP8_ENCODER_TOKENIZE_H_ #include "vp8/common/entropy.h" #include "block.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_tokenize_initialize(); typedef struct @@ -47,4 +51,8 @@ extern const short *const vp8_dct_value_cost_ptr; */ extern const TOKENVALUE *const vp8_dct_value_tokens_ptr; -#endif /* tokenize_h */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_TOKENIZE_H_ diff --git a/media/libvpx/vp8/encoder/treewriter.h b/media/libvpx/vp8/encoder/treewriter.h index 48574f33cbae..cfb2730ab3b2 100644 --- a/media/libvpx/vp8/encoder/treewriter.h +++ b/media/libvpx/vp8/encoder/treewriter.h @@ -9,8 +9,8 @@ */ -#ifndef 
__INC_TREEWRITER_H -#define __INC_TREEWRITER_H +#ifndef VP8_ENCODER_TREEWRITER_H_ +#define VP8_ENCODER_TREEWRITER_H_ /* Trees map alphabets into huffman-like codes suitable for an arithmetic bit coder. Timothy S Murphy 11 October 2004 */ @@ -19,6 +19,10 @@ #include "boolhuff.h" /* for now */ +#ifdef __cplusplus +extern "C" { +#endif + typedef BOOL_CODER vp8_writer; #define vp8_write vp8_encode_bool @@ -123,4 +127,8 @@ void vp8_cost_tokens2( int *Costs, const vp8_prob *, vp8_tree, int ); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_TREEWRITER_H_ diff --git a/media/libvpx/vp8/encoder/x86/denoising_sse2.c b/media/libvpx/vp8/encoder/x86/denoising_sse2.c index cceb8263f24e..3a4cf7ee792f 100644 --- a/media/libvpx/vp8/encoder/x86/denoising_sse2.c +++ b/media/libvpx/vp8/encoder/x86/denoising_sse2.c @@ -17,31 +17,46 @@ #include #include "vpx_ports/emmintrin_compat.h" -union sum_union { - __m128i v; - signed char e[16]; -}; +/* Compute the sum of all pixel differences of this MB. */ +static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) { + const __m128i k_1 = _mm_set1_epi16(1); + const __m128i acc_diff_lo = _mm_srai_epi16( + _mm_unpacklo_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_hi = _mm_srai_epi16( + _mm_unpackhi_epi8(acc_diff, acc_diff), 8); + const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); + const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba)); -int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg, - YV12_BUFFER_CONFIG *running_avg, - MACROBLOCK *signal, unsigned int motion_magnitude, - int y_offset, int uv_offset) + return sum_diff; +} + +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, + int mc_avg_y_stride, + unsigned char *running_avg_y, int avg_y_stride, + unsigned char *sig, int sig_stride, + unsigned int motion_magnitude, + int increase_denoising) { - unsigned char *sig = signal->thismb; - int sig_stride = 16; - unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset; - int mc_avg_y_stride = mc_running_avg->y_stride; - unsigned char *running_avg_y = running_avg->y_buffer + y_offset; - int avg_y_stride = running_avg->y_stride; + unsigned char *running_avg_y_start = running_avg_y; + unsigned char *sig_start = sig; + unsigned int sum_diff_thresh; int r; + int shift_inc = (increase_denoising && + motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0; __m128i acc_diff = _mm_setzero_si128(); const __m128i k_0 = _mm_setzero_si128(); - const __m128i k_4 = _mm_set1_epi8(4); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); const __m128i k_8 = _mm_set1_epi8(8); const __m128i k_16 = _mm_set1_epi8(16); /* Modify each level's adjustment according to motion_magnitude. */ const __m128i l3 = _mm_set1_epi8( - (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6); + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? + 7 + shift_inc : 6); /* Difference between level 3 and level 2 is 2. */ const __m128i l32 = _mm_set1_epi8(2); /* Difference between level 2 and level 1 is 1. */ @@ -101,20 +116,264 @@ int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg, { /* Compute the sum of all pixel differences of this MB. 
 */
-        union sum_union s;
-        int sum_diff = 0;
-        s.v = acc_diff;
-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
-        if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
-        {
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            sig -= sig_stride * 16;
+            mc_running_avg_y -= mc_avg_y_stride * 16;
+            running_avg_y -= avg_y_stride * 16;
+            for (r = 0; r < 16; ++r) {
+              __m128i v_running_avg_y =
+                  _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+              // Calculate differences.
+              const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+              const __m128i v_mc_running_avg_y =
+                  _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value.
+              v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+              v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+              _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+              // Accumulate the adjustments.
+              acc_diff = _mm_subs_epi8(acc_diff, padj);
+              acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+              // Update pointers for next iteration.
+              sig += sig_stride;
+              mc_running_avg_y += mc_avg_y_stride;
+              running_avg_y += avg_y_stride;
+            }
+            abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+            if (abs_sum_diff > sum_diff_thresh) {
+              return COPY_BLOCK;
+            }
+          } else {
             return COPY_BLOCK;
+          }
         }
     }
-    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
-                      signal->thismb, sig_stride);
+    vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}
+
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
+                                int mc_avg_stride,
+                                unsigned char *running_avg, int avg_stride,
+                                unsigned char *sig, int sig_stride,
+                                unsigned int motion_magnitude,
+                                int increase_denoising) {
+    unsigned char *running_avg_start = running_avg;
+    unsigned char *sig_start = sig;
+    unsigned int sum_diff_thresh;
+    int r;
+    int shift_inc = (increase_denoising &&
+                     motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
1 : 0; + __m128i acc_diff = _mm_setzero_si128(); + const __m128i k_0 = _mm_setzero_si128(); + const __m128i k_4 = _mm_set1_epi8(4 + shift_inc); + const __m128i k_8 = _mm_set1_epi8(8); + const __m128i k_16 = _mm_set1_epi8(16); + /* Modify each level's adjustment according to motion_magnitude. */ + const __m128i l3 = _mm_set1_epi8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? + 7 + shift_inc : 6); + /* Difference between level 3 and level 2 is 2. */ + const __m128i l32 = _mm_set1_epi8(2); + /* Difference between level 2 and level 1 is 1. */ + const __m128i l21 = _mm_set1_epi8(1); + + { + const __m128i k_1 = _mm_set1_epi16(1); + __m128i vec_sum_block = _mm_setzero_si128(); + + // Avoid denoising color signal if its close to average level. + for (r = 0; r < 8; ++r) { + const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0])); + const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0); + vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack); + sig += sig_stride; + } + sig -= sig_stride * 8; + { + const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1); + const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, + _mm_srli_si128(hg_fe_dc_ba, 8)); + const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, + _mm_srli_si128(hgfe_dcba, 4)); + const int sum_block = _mm_cvtsi128_si32(hgfedcba); + if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) { + return COPY_BLOCK; + } + } + } + + for (r = 0; r < 4; ++r) { + /* Calculate differences */ + const __m128i v_sig_low = _mm_castpd_si128( + _mm_load_sd((double *)(&sig[0]))); + const __m128i v_sig = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_sig_low), + (double *)(&sig[sig_stride]))); + const __m128i v_mc_running_avg_low = _mm_castpd_si128( + _mm_load_sd((double *)(&mc_running_avg[0]))); + const __m128i v_mc_running_avg = _mm_castpd_si128( + _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low), + (double *)(&mc_running_avg[mc_avg_stride]))); + const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig); + const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg); + /* Obtain the sign. FF if diff is negative. */ + const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0); + /* Clamp absolute difference to 16 to be used to get mask. Doing this + * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */ + const __m128i clamped_absdiff = _mm_min_epu8( + _mm_or_si128(pdiff, ndiff), k_16); + /* Get masks for l2 l1 and l0 adjustments */ + const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff); + const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff); + const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff); + /* Get adjustments for l2, l1, and l0 */ + __m128i adj2 = _mm_and_si128(mask2, l32); + const __m128i adj1 = _mm_and_si128(mask1, l21); + const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff); + __m128i adj, padj, nadj; + __m128i v_running_avg; + + /* Combine the adjustments and get absolute adjustments. */ + adj2 = _mm_add_epi8(adj2, adj1); + adj = _mm_sub_epi8(l3, adj2); + adj = _mm_andnot_si128(mask0, adj); + adj = _mm_or_si128(adj, adj0); + + /* Restore the sign and get positive and negative adjustments. */ + padj = _mm_andnot_si128(diff_sign, adj); + nadj = _mm_and_si128(diff_sign, adj); + + /* Calculate filtered value. 
 */
+        v_running_avg = _mm_adds_epu8(v_sig, padj);
+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
+
+        _mm_storel_pd((double *)&running_avg[0],
+                      _mm_castsi128_pd(v_running_avg));
+        _mm_storeh_pd((double *)&running_avg[avg_stride],
+                      _mm_castsi128_pd(v_running_avg));
+
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
+         */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_avg_stride * 2;
+        running_avg += avg_stride * 2;
+    }
+
+    {
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (abs_sum_diff > sum_diff_thresh) {
+          // Before returning to copy the block (i.e., apply no denoising),
+          // check if we can still apply some (weaker) temporal filtering to
+          // this block, that would otherwise not be denoised at all. Simplest
+          // is to apply an additional adjustment to running_avg_y to bring it
+          // closer to sig. The adjustment is capped by a maximum delta, and
+          // chosen such that in most cases the resulting sum_diff will be
+          // within the acceptable range given by sum_diff_thresh.
+
+          // The delta is set by the excess of absolute pixel diff over the
+          // threshold.
+          int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+          // Only apply the adjustment for max delta up to 3.
+          if (delta < 4) {
+            const __m128i k_delta = _mm_set1_epi8(delta);
+            sig -= sig_stride * 8;
+            mc_running_avg -= mc_avg_stride * 8;
+            running_avg -= avg_stride * 8;
+            for (r = 0; r < 4; ++r) {
+              // Calculate differences.
+              const __m128i v_sig_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&sig[0])));
+              const __m128i v_sig = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                               (double *)(&sig[sig_stride])));
+              const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&mc_running_avg[0])));
+              const __m128i v_mc_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                               (double *)(&mc_running_avg[mc_avg_stride])));
+              const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+              const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+              // Obtain the sign. FF if diff is negative.
+              const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+              // Clamp absolute difference to delta to get the adjustment.
+              const __m128i adj =
+                  _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+              // Restore the sign and get positive and negative adjustments.
+              __m128i padj, nadj;
+              const __m128i v_running_avg_low = _mm_castpd_si128(
+                  _mm_load_sd((double *)(&running_avg[0])));
+              __m128i v_running_avg = _mm_castpd_si128(
+                  _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
+                               (double *)(&running_avg[avg_stride])));
+              padj = _mm_andnot_si128(diff_sign, adj);
+              nadj = _mm_and_si128(diff_sign, adj);
+              // Calculate filtered value.
+              v_running_avg = _mm_subs_epu8(v_running_avg, padj);
+              v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
+
+              _mm_storel_pd((double *)&running_avg[0],
+                            _mm_castsi128_pd(v_running_avg));
+              _mm_storeh_pd((double *)&running_avg[avg_stride],
+                            _mm_castsi128_pd(v_running_avg));
+
+              // Accumulate the adjustments.
+              acc_diff = _mm_subs_epi8(acc_diff, padj);
+              acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+              // Update pointers for next iteration.
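/* [Editor's note -- illustrative sketch, not part of the patch.]
 * The decision wrapped around both SIMD loops in this file, in scalar
 * form: when the accumulated |sum_diff| exceeds the threshold, a second,
 * weaker pass nudges the running average toward the source by at most a
 * small per-pixel delta instead of abandoning denoising outright. */
static int second_pass_delta(unsigned int abs_sum_diff,
                             unsigned int sum_diff_thresh)
{
    /* One extra unit of adjustment per 256 of excess difference. */
    const int delta = (int)((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
    return delta < 4 ? delta : -1;   /* -1: too far off, copy the block */
}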
+ sig += sig_stride * 2; + mc_running_avg += mc_avg_stride * 2; + running_avg += avg_stride * 2; + } + abs_sum_diff = abs_sum_diff_16x1(acc_diff); + if (abs_sum_diff > sum_diff_thresh) { + return COPY_BLOCK; + } + } else { + return COPY_BLOCK; + } + } + } + + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); return FILTER_BLOCK; } diff --git a/media/libvpx/vp8/encoder/x86/quantize_sse2.c b/media/libvpx/vp8/encoder/x86/quantize_sse2.c index f495bf287d00..291d21992fe7 100644 --- a/media/libvpx/vp8/encoder/x86/quantize_sse2.c +++ b/media/libvpx/vp8/encoder/x86/quantize_sse2.c @@ -26,11 +26,10 @@ int cmp = (x[z] < boost) | (y[z] == 0); \ zbin_boost_ptr++; \ if (cmp) \ - goto select_eob_end_##i; \ + break; \ qcoeff_ptr[z] = y[z]; \ eob = i; \ zbin_boost_ptr = b->zrun_zbin_boost; \ - select_eob_end_##i:; \ } while (0) void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) diff --git a/media/libvpx/vp8/encoder/x86/quantize_sse4.asm b/media/libvpx/vp8/encoder/x86/quantize_sse4.asm deleted file mode 100644 index dbd171bfcdee..000000000000 --- a/media/libvpx/vp8/encoder/x86/quantize_sse4.asm +++ /dev/null @@ -1,256 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -%include "vp8_asm_enc_offsets.asm" - - -; void vp8_regular_quantize_b_sse4 | arg -; (BLOCK *b, | 0 -; BLOCKD *d) | 1 - -global sym(vp8_regular_quantize_b_sse4) PRIVATE -sym(vp8_regular_quantize_b_sse4): - -%if ABI_IS_32BIT - push rbp - mov rbp, rsp - GET_GOT rbx - push rdi - push rsi - - ALIGN_STACK 16, rax - %define qcoeff 0 ; 32 - %define stack_size 32 - sub rsp, stack_size -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 8, u - push rdi - push rsi - %endif -%endif - ; end prolog - -%if ABI_IS_32BIT - mov rdi, arg(0) ; BLOCK *b - mov rsi, arg(1) ; BLOCKD *d -%else - %if LIBVPX_YASM_WIN64 - mov rdi, rcx ; BLOCK *b - mov rsi, rdx ; BLOCKD *d - %else - ;mov rdi, rdi ; BLOCK *b - ;mov rsi, rsi ; BLOCKD *d - %endif -%endif - - mov rax, [rdi + vp8_block_coeff] - mov rcx, [rdi + vp8_block_zbin] - mov rdx, [rdi + vp8_block_round] - movd xmm7, [rdi + vp8_block_zbin_extra] - - ; z - movdqa xmm0, [rax] - movdqa xmm1, [rax + 16] - - ; duplicate zbin_oq_value - pshuflw xmm7, xmm7, 0 - punpcklwd xmm7, xmm7 - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - ; sz - psraw xmm0, 15 - psraw xmm1, 15 - - ; (z ^ sz) - pxor xmm2, xmm0 - pxor xmm3, xmm1 - - ; x = abs(z) - psubw xmm2, xmm0 - psubw xmm3, xmm1 - - ; zbin - movdqa xmm4, [rcx] - movdqa xmm5, [rcx + 16] - - ; *zbin_ptr + zbin_oq_value - paddw xmm4, xmm7 - paddw xmm5, xmm7 - - movdqa xmm6, xmm2 - movdqa xmm7, xmm3 - - ; x - (*zbin_ptr + zbin_oq_value) - psubw xmm6, xmm4 - psubw xmm7, xmm5 - - ; round - movdqa xmm4, [rdx] - movdqa xmm5, [rdx + 16] - - mov rax, [rdi + vp8_block_quant_shift] - mov rcx, [rdi + vp8_block_quant] - mov rdx, [rdi + vp8_block_zrun_zbin_boost] - - ; x + round - paddw xmm2, xmm4 - paddw xmm3, xmm5 - - ; quant - movdqa xmm4, [rcx] - movdqa xmm5, [rcx + 16] - - ; y = x * quant_ptr >> 16 - pmulhw xmm4, xmm2 - pmulhw xmm5, xmm3 - - ; y += x - paddw xmm2, xmm4 - paddw xmm3, xmm5 - - pxor xmm4, xmm4 -%if ABI_IS_32BIT - movdqa [rsp + qcoeff], xmm4 - movdqa [rsp + qcoeff + 16], xmm4 -%else - pxor xmm8, xmm8 -%endif - - ; quant_shift - movdqa 
xmm5, [rax] - - ; zrun_zbin_boost - mov rax, rdx - -%macro ZIGZAG_LOOP 5 - ; x - pextrw ecx, %4, %2 - - ; if (x >= zbin) - sub cx, WORD PTR[rdx] ; x - zbin - lea rdx, [rdx + 2] ; zbin_boost_ptr++ - jl .rq_zigzag_loop_%1 ; x < zbin - - pextrw edi, %3, %2 ; y - - ; downshift by quant_shift[rc] - pextrb ecx, xmm5, %1 ; quant_shift[rc] - sar edi, cl ; also sets Z bit - je .rq_zigzag_loop_%1 ; !y -%if ABI_IS_32BIT - mov WORD PTR[rsp + qcoeff + %1 *2], di -%else - pinsrw %5, edi, %2 ; qcoeff[rc] -%endif - mov rdx, rax ; reset to b->zrun_zbin_boost -.rq_zigzag_loop_%1: -%endmacro -; in vp8_default_zig_zag1d order: see vp8/common/entropy.c -ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4 -ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8 -ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8 - - mov rcx, [rsi + vp8_blockd_dequant] - mov rdi, [rsi + vp8_blockd_dqcoeff] - -%if ABI_IS_32BIT - movdqa xmm4, [rsp + qcoeff] - movdqa xmm5, [rsp + qcoeff + 16] -%else - %define xmm5 xmm8 -%endif - - ; y ^ sz - pxor xmm4, xmm0 - pxor xmm5, xmm1 - ; x = (y ^ sz) - sz - psubw xmm4, xmm0 - psubw xmm5, xmm1 - - ; dequant - movdqa xmm0, [rcx] - movdqa xmm1, [rcx + 16] - - mov rcx, [rsi + vp8_blockd_qcoeff] - - pmullw xmm0, xmm4 - pmullw xmm1, xmm5 - - ; store qcoeff - movdqa [rcx], xmm4 - movdqa [rcx + 16], xmm5 - - ; store dqcoeff - movdqa [rdi], xmm0 - movdqa [rdi + 16], xmm1 - - mov rcx, [rsi + vp8_blockd_eob] - - ; select the last value (in zig_zag order) for EOB - pxor xmm6, xmm6 - pcmpeqw xmm4, xmm6 - pcmpeqw xmm5, xmm6 - - packsswb xmm4, xmm5 - pshufb xmm4, [GLOBAL(zig_zag1d)] - pmovmskb edx, xmm4 - xor rdi, rdi - mov eax, -1 - xor dx, ax - bsr eax, edx - sub edi, edx - sar edi, 31 - add eax, 1 - and eax, edi - - mov BYTE PTR [rcx], al ; store eob - - ; begin epilog -%if ABI_IS_32BIT - add rsp, stack_size - pop rsp - - pop rsi - pop rdi - RESTORE_GOT - pop rbp -%else - %undef xmm5 - %if LIBVPX_YASM_WIN64 - pop rsi - pop rdi - RESTORE_XMM - %endif -%endif - - ret - -SECTION_RODATA -align 16 -; vp8/common/entropy.c: vp8_default_zig_zag1d -zig_zag1d: - db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --git a/media/libvpx/vp8/encoder/x86/quantize_sse4.c b/media/libvpx/vp8/encoder/x86/quantize_sse4.c new file mode 100644 index 000000000000..601dd23a2f78 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp8_rtcd.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#define SELECT_EOB(i, z, x, y, q) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        short x_z = _mm_extract_epi16(x, z); \
+        short y_z = _mm_extract_epi16(y, z); \
+        int cmp = (x_z < boost) | (y_z == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            break; \
+        q = _mm_insert_epi16(q, y_z, z); \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+    char eob = 0;
+    short *zbin_boost_ptr = b->zrun_zbin_boost;
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
+            dqcoeff0, dqcoeff1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+    __m128i qcoeff0 = _mm_setzero_si128();
+    __m128i qcoeff1 = _mm_setzero_si128();
+
+    /* Duplicate to all lanes. */
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+     * the equation because boost is the only value which can change:
+     * x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    /* All the remaining calculations are valid whether they are done now with
+     * simd or later inside the loop one at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
+
+    /* Instead of shifting each value independently we convert the scaling
+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup.
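/* [Editor's note -- illustrative sketch, not part of the patch.]
 * What the SELECT_EOB macro above computes, in scalar form: walk the
 * coefficients in zig-zag order and, whenever one survives quantization
 * (x >= zbin + boost and y != 0), record its 1-based position and reset
 * the zero-run boost. Inputs are assumed pre-ordered in zig-zag scan. */
static int select_eob_scalar(const short x_minus_zbin[16], const short y[16],
                             const short zrun_zbin_boost[16])
{
    int eob = 0, boost_index = 0, i;
    for (i = 0; i < 16; ++i) {
        const short boost = zrun_zbin_boost[boost_index++];
        if (x_minus_zbin[i] < boost || y[i] == 0)
            continue;                /* quantizes to zero; keep the run */
        eob = i + 1;                 /* 1-based, like SELECT_EOB's first arg */
        boost_index = 0;             /* a nonzero coefficient resets the run */
    }
    return eob;
}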
*/ + SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0); + SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1); + SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1); + + _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1); + + dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0); + dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1); + + _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1); + + *d->eob = eob; +} diff --git a/media/libvpx/vp8/encoder/x86/quantize_ssse3.asm b/media/libvpx/vp8/encoder/x86/quantize_ssse3.asm deleted file mode 100644 index 7b1dc119f080..000000000000 --- a/media/libvpx/vp8/encoder/x86/quantize_ssse3.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -%include "vp8_asm_enc_offsets.asm" - - -; void vp8_fast_quantize_b_ssse3 | arg -; (BLOCK *b, | 0 -; BLOCKD *d) | 1 -; - -global sym(vp8_fast_quantize_b_ssse3) PRIVATE -sym(vp8_fast_quantize_b_ssse3): - push rbp - mov rbp, rsp - GET_GOT rbx - -%if ABI_IS_32BIT - push rdi - push rsi -%else - %if LIBVPX_YASM_WIN64 - push rdi - push rsi - %endif -%endif - ; end prolog - -%if ABI_IS_32BIT - mov rdi, arg(0) ; BLOCK *b - mov rsi, arg(1) ; BLOCKD *d -%else - %if LIBVPX_YASM_WIN64 - mov rdi, rcx ; BLOCK *b - mov rsi, rdx ; BLOCKD *d - %else - ;mov rdi, rdi ; BLOCK *b - ;mov rsi, rsi ; BLOCKD *d - %endif -%endif - - mov rax, [rdi + vp8_block_coeff] - mov rcx, [rdi + vp8_block_round] - mov rdx, [rdi + vp8_block_quant_fast] - - ; coeff - movdqa xmm0, [rax] - movdqa xmm4, [rax + 16] - - ; round - movdqa xmm2, [rcx] - movdqa xmm3, [rcx + 16] - - movdqa xmm1, xmm0 - movdqa xmm5, xmm4 - - ; sz = z >> 15 - psraw xmm0, 15 - psraw xmm4, 15 - - pabsw xmm1, xmm1 - pabsw xmm5, xmm5 - - paddw xmm1, xmm2 - paddw xmm5, xmm3 - - ; quant_fast - pmulhw xmm1, [rdx] - pmulhw xmm5, [rdx + 16] - - mov rax, [rsi + vp8_blockd_qcoeff] - mov rdi, [rsi + vp8_blockd_dequant] - mov rcx, [rsi + vp8_blockd_dqcoeff] - - movdqa xmm2, xmm1 ;store y for getting eob - movdqa xmm3, xmm5 - - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - movdqa [rax], xmm1 - movdqa [rax + 16], xmm5 - - movdqa xmm0, [rdi] - movdqa xmm4, [rdi + 16] - - pmullw xmm0, xmm1 - pmullw xmm4, xmm5 - pxor xmm1, xmm1 - - pcmpgtw xmm2, xmm1 ;calculate eob - pcmpgtw xmm3, xmm1 - packsswb xmm2, xmm3 - pshufb xmm2, [GLOBAL(zz_shuf)] - - pmovmskb edx, xmm2 - - movdqa [rcx], xmm0 ;store dqcoeff - movdqa [rcx + 16], xmm4 ;store dqcoeff - mov rcx, [rsi + vp8_blockd_eob] - - bsr eax, edx ;count 0 - add eax, 1 - - cmp 
edx, 0 ;if all 0, eob=0
-    cmove eax, edx
-
-    mov BYTE PTR [rcx], al ;store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop rsi
-    pop rdi
-%else
-  %if LIBVPX_YASM_WIN64
-    pop rsi
-    pop rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop rbp
-    ret
-
-SECTION_RODATA
-align 16
-zz_shuf:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/media/libvpx/vp8/encoder/x86/quantize_ssse3.c b/media/libvpx/vp8/encoder/x86/quantize_ssse3.c
new file mode 100644
index 000000000000..448217ff412f
--- /dev/null
+++ b/media/libvpx/vp8/encoder/x86/quantize_ssse3.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> /* SSSE3 */
+
+#include "vp8/encoder/block.h"
+
+/* bitscan reverse (bsr) */
+#if defined(_MSC_VER)
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+static int bsr(int mask) {
+  int eob;
+  _BitScanReverse(&eob, mask);
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return eob;
+}
+#else
+static int bsr(int mask) {
+  int eob;
+#if defined(__GNUC__) && __GNUC__
+  __asm__ __volatile__("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+  asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags");
+#endif
+  eob++;
+  if (mask == 0)
+    eob = 0;
+  return eob;
+}
+#endif
+
+void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
+  int eob, mask;
+
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
+  __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1;
+
+  DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) =
+      { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+  __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask);
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z) */
+  x0 = _mm_abs_epi16(z0);
+  x1 = _mm_abs_epi16(z1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* ASM saves Y for EOB */
+  /* I think we can ignore that because adding the sign doesn't change anything
+   * and multiplying 0 by dequant is OK as well */
+  abs0 = y0;
+  abs1 = y1;
+
+  /* Restore the sign bit.
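/* [Editor's note -- illustrative sketch, not part of the patch.]
 * The bsr() wrappers above return 1 + the index of the highest set bit
 * (0 for an empty mask); once the nonzero flags are shuffled into
 * zig-zag order, that is exactly the EOB. A portable fallback: */
static int bsr_portable(int mask)
{
    int eob = 0;
    while (mask) {   /* count bits up to and including the highest set one */
        ++eob;
        mask >>= 1;
    }
    return eob;
}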
*/ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + x0 = _mm_mullo_epi16(x0, dequant0); + x1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), x0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1); + + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpgt_epi16(abs0, zeros); + x1 = _mm_cmpgt_epi16(abs1, zeros); + + x = _mm_packs_epi16(x0, x1); + + x = _mm_shuffle_epi8(x, zig_zag); + + mask = _mm_movemask_epi8(x); + + eob = bsr(mask); + + *d->eob = 0xFF & eob; +} diff --git a/media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm b/media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm new file mode 100644 index 000000000000..5964a85f2cf1 --- /dev/null +++ b/media/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm @@ -0,0 +1,216 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE +sym(vp8_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
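/* [Editor's note -- illustrative C sketch, not part of the patch.]
 * Scalar reference for what the two assembly routines in this new file
 * accumulate: the five running sums SSIM needs over an NxN window
 * (n = 16 or 8). The C mirror below is hypothetical, shown only to
 * document the contract of vp8_ssim_parms_{16x16,8x8}_sse2: */
static void ssim_parms_c(const unsigned char *s, int sp,
                         const unsigned char *r, int rp, int n,
                         unsigned long *sum_s, unsigned long *sum_r,
                         unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                         unsigned long *sum_sxr)
{
    int i, j;
    for (i = 0; i < n; ++i, s += sp, r += rp) {
        for (j = 0; j < n; ++j) {
            *sum_s += s[j];
            *sum_r += r[j];
            *sum_sq_s += (unsigned long)s[j] * s[j];
            *sum_sq_r += (unsigned long)r[j] * r[j];
            *sum_sxr += (unsigned long)s[j] * r[j];
        }
    }
}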
+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE +sym(vp8_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/vp8/vp8_cx_iface.c index 19e9d2701837..b1b079cb2605 100644 --- a/media/libvpx/vp8/vp8_cx_iface.c +++ b/media/libvpx/vp8/vp8_cx_iface.c @@ -9,10 +9,12 @@ */ +#include "./vpx_config.h" #include "vp8_rtcd.h" #include "vpx/vpx_codec.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" +#include "vpx_mem/vpx_mem.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -38,40 +40,28 @@ struct vp8_extracfg }; -struct extraconfig_map -{ - int usage; - struct vp8_extracfg cfg; -}; - -static const struct extraconfig_map extracfg_map[] = -{ - { - 0, - { - NULL, +static struct vp8_extracfg default_extracfg = { + NULL, #if !(CONFIG_REALTIME_ONLY) - 0, /* cpu_used */ + 0, /* cpu_used */ #else - 4, /* cpu_used */ + 4, /* cpu_used */ #endif - 0, /* enable_auto_alt_ref */ - 0, /* noise_sensitivity */ - 0, /* Sharpness */ - 0, /* static_thresh */ + 0, /* enable_auto_alt_ref */ + 0, /* noise_sensitivity */ + 0, /* Sharpness */ + 0, /* static_thresh */ #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) - VP8_EIGHT_TOKENPARTITION, + VP8_EIGHT_TOKENPARTITION, #else - VP8_ONE_TOKENPARTITION, /* token_partitions */ + VP8_ONE_TOKENPARTITION, /* token_partitions */ #endif - 0, /* arnr_max_frames */ - 3, /* arnr_strength */ - 3, /* arnr_type*/ - 0, /* tuning*/ - 10, /* cq_level */ - 0, /* rc_max_intra_bitrate_pct */ - } - } + 0, /* arnr_max_frames */ + 3, /* arnr_strength */ + 3, /* arnr_type*/ + 0, /* tuning*/ + 10, /* cq_level */ + 0, /* rc_max_intra_bitrate_pct */ }; struct vpx_codec_alg_priv @@ -414,7 +404,6 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, printf("Sharpness: %d\n", oxcf->Sharpness); printf("cpu_used: %d\n", oxcf->cpu_used); printf("Mode: %d\n", oxcf->Mode); - printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); printf("auto_key: %d\n", oxcf->auto_key); printf("key_freq: %d\n", oxcf->key_freq); printf("end_usage: %d\n", oxcf->end_usage); @@ -473,70 +462,128 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, return res; } - int vp8_reverse_trans(int); - -static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) +static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { - void *arg = va_arg(args, void *); - -#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break - - if (!arg) - return 
VPX_CODEC_INVALID_PARAM; - - switch (ctrl_id) - { - MAP(VP8E_GET_LAST_QUANTIZER, vp8_get_quantizer(ctx->cpi)); - MAP(VP8E_GET_LAST_QUANTIZER_64, vp8_reverse_trans(vp8_get_quantizer(ctx->cpi))); - } - - return VPX_CODEC_OK; -#undef MAP + int *const arg = va_arg(args, int *); + if (arg == NULL) + return VPX_CODEC_INVALID_PARAM; + *arg = vp8_get_quantizer(ctx->cpi); + return VPX_CODEC_OK; } - -static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) +static vpx_codec_err_t get_quantizer64(vpx_codec_alg_priv_t *ctx, va_list args) { - vpx_codec_err_t res = VPX_CODEC_OK; - struct vp8_extracfg xcfg = ctx->vp8_cfg; + int *const arg = va_arg(args, int *); + if (arg == NULL) + return VPX_CODEC_INVALID_PARAM; + *arg = vp8_reverse_trans(vp8_get_quantizer(ctx->cpi)); + return VPX_CODEC_OK; +} -#define MAP(id, var) case id: var = CAST(id, args); break; +static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx, + const struct vp8_extracfg *extra_cfg) +{ + const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg, 0); + if (res == VPX_CODEC_OK) { + ctx->vp8_cfg = *extra_cfg; + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); + vp8_change_config(ctx->cpi, &ctx->oxcf); + } + return res; +} - switch (ctrl_id) - { - MAP(VP8E_SET_CPUUSED, xcfg.cpu_used); - MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref); - MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); - MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); - MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); - MAP(VP8E_SET_TOKEN_PARTITIONS, xcfg.token_partitions); +static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + return update_extracfg(ctx, &extra_cfg); +} - MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); - MAP(VP8E_SET_ARNR_STRENGTH , xcfg.arnr_strength); - MAP(VP8E_SET_ARNR_TYPE , xcfg.arnr_type); - MAP(VP8E_SET_TUNING, xcfg.tuning); - MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); - MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); +static vpx_codec_err_t set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args); + return update_extracfg(ctx, &extra_cfg); +} - } +static vpx_codec_err_t set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args); + return update_extracfg(ctx, &extra_cfg); +} - res = validate_config(ctx, &ctx->cfg, &xcfg, 0); +static vpx_codec_err_t set_sharpness(vpx_codec_alg_priv_t *ctx, va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.Sharpness = CAST(VP8E_SET_SHARPNESS, args); + return update_extracfg(ctx, &extra_cfg); +} - if (!res) - { - ctx->vp8_cfg = xcfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); - vp8_change_config(ctx->cpi, &ctx->oxcf); - } +static vpx_codec_err_t set_static_thresh(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args); + return update_extracfg(ctx, &extra_cfg); +} - return res; -#undef MAP +static vpx_codec_err_t set_token_partitions(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.token_partitions = CAST(VP8E_SET_TOKEN_PARTITIONS, args); 
+ return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_arnr_strength(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_arnr_type(vpx_codec_alg_priv_t *ctx, va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.arnr_type = CAST(VP8E_SET_ARNR_TYPE, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_tuning(vpx_codec_alg_priv_t *ctx, va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.tuning = CAST(VP8E_SET_TUNING, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_cq_level(vpx_codec_alg_priv_t *ctx, va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.cq_level = CAST(VP8E_SET_CQ_LEVEL, args); + return update_extracfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx, + va_list args) +{ + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extracfg(ctx, &extra_cfg); } static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, @@ -573,27 +620,21 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_CODEC_OK; - struct vpx_codec_alg_priv *priv; - vpx_codec_enc_cfg_t *cfg; - unsigned int i; - struct VP8_COMP *optr; vp8_rtcd(); if (!ctx->priv) { - priv = calloc(1, sizeof(struct vpx_codec_alg_priv)); + struct vpx_codec_alg_priv *priv = + (struct vpx_codec_alg_priv *)vpx_calloc(1, sizeof(*priv)); if (!priv) { return VPX_CODEC_MEM_ERROR; } - ctx->priv = &priv->base; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->iface = ctx->iface; - ctx->priv->alg_priv = priv; + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; if (ctx->config.enc) @@ -601,21 +642,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, /* Update the reference to the config structure to an * internal copy. */ - ctx->priv->alg_priv->cfg = *ctx->config.enc; - ctx->config.enc = &ctx->priv->alg_priv->cfg; + priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; } - cfg = &ctx->priv->alg_priv->cfg; - - /* Select the extra vp8 configuration table based on the current - * usage value. If the current usage value isn't found, use the - * values for usage case 0. 
- */ - for (i = 0; - extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage; - i++); - - priv->vp8_cfg = extracfg_map[i].cfg; + priv->vp8_cfg = default_extracfg; priv->vp8_cfg.pkt_list = &priv->pkt_list.head; priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2; @@ -638,17 +669,10 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, if (!res) { - set_vp8e_config(&ctx->priv->alg_priv->oxcf, - ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg, - mr_cfg); - - optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf); - - if (!optr) + set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg); + priv->cpi = vp8_create_compressor(&priv->oxcf); + if (!priv->cpi) res = VPX_CODEC_MEM_ERROR; - else - ctx->priv->alg_priv->cpi = optr; } } @@ -669,24 +693,30 @@ static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) free(ctx->cx_data); vp8_remove_compressor(&ctx->cpi); - free(ctx); + vpx_free(ctx); return VPX_CODEC_OK; } static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, YV12_BUFFER_CONFIG *yv12) { + const int y_w = img->d_w; + const int y_h = img->d_h; + const int uv_w = (img->d_w + 1) / 2; + const int uv_h = (img->d_h + 1) / 2; vpx_codec_err_t res = VPX_CODEC_OK; yv12->y_buffer = img->planes[VPX_PLANE_Y]; yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; - yv12->y_crop_width = img->d_w; - yv12->y_crop_height = img->d_h; - yv12->y_width = img->d_w; - yv12->y_height = img->d_h; - yv12->uv_width = (1 + yv12->y_width) / 2; - yv12->uv_height = (1 + yv12->y_height) / 2; + yv12->y_crop_width = y_w; + yv12->y_crop_height = y_h; + yv12->y_width = y_w; + yv12->y_height = y_h; + yv12->uv_crop_width = uv_w; + yv12->uv_crop_height = uv_h; + yv12->uv_width = uv_w; + yv12->uv_height = uv_h; yv12->y_stride = img->stride[VPX_PLANE_Y]; yv12->uv_stride = img->stride[VPX_PLANE_U]; @@ -748,9 +778,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, { vpx_codec_err_t res = VPX_CODEC_OK; - if (!ctx->cfg.rc_target_bitrate) - return res; - if (!ctx->cfg.rc_target_bitrate) return res; @@ -890,7 +917,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; /* Add the frame packet to the list of returned packets. 
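/* [Editor's note -- illustrative sketch, not part of the patch.]
 * The hunk below changes the packet-timestamp rounding constant from a
 * 1 MHz to a 10 MHz base, matching the encoder's internal tick rate.
 * `round` is roughly half of one output timebase unit expressed in
 * ticks, so converting 10 MHz ticks back to the caller's g_timebase
 * rounds to nearest. A sketch under that assumption; the helper name
 * and the exact division layout of the surrounding code (not shown in
 * this hunk) are assumptions: */
#include <stdint.h>

static int64_t ticks_to_timebase_units(int64_t ticks_10mhz,
                                       int64_t num, int64_t den)
{
    const int64_t ticks_per_sec = 10000000;            /* encoder tick rate */
    const int64_t round = ticks_per_sec * num / 2 - 1; /* as in the hunk */
    return (ticks_10mhz * den + round) / num / ticks_per_sec;
}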
*/ - round = (vpx_codec_pts_t)1000000 + round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1; delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; @@ -980,8 +1007,7 @@ static const vpx_codec_cx_pkt_t *vp8e_get_cxdata(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -1000,8 +1026,7 @@ static vpx_codec_err_t vp8e_set_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -1020,12 +1045,10 @@ static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { #if CONFIG_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); - (void)ctr_id; if (data) { @@ -1036,7 +1059,6 @@ static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; #else (void)ctx; - (void)ctr_id; (void)args; return VPX_CODEC_INCAPABLE; #endif @@ -1094,8 +1116,7 @@ static vpx_image_t *vp8e_get_preview(vpx_codec_alg_priv_t *ctx) } static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { int update = va_arg(args, int); vp8_update_entropy(ctx->cpi, update); @@ -1104,8 +1125,7 @@ static vpx_codec_err_t vp8e_update_entropy(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { int update = va_arg(args, int); vp8_update_reference(ctx->cpi, update); @@ -1113,8 +1133,7 @@ static vpx_codec_err_t vp8e_update_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { int reference_flag = va_arg(args, int); vp8_use_as_reference(ctx->cpi, reference_flag); @@ -1122,7 +1141,6 @@ static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); @@ -1142,8 +1160,7 @@ static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { vpx_active_map_t *data = va_arg(args, vpx_active_map_t *); @@ -1162,8 +1179,7 @@ static vpx_codec_err_t vp8e_set_activemap(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *); @@ -1201,20 +1217,20 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {VP8E_SET_ROI_MAP, vp8e_set_roi_map}, {VP8E_SET_ACTIVEMAP, vp8e_set_activemap}, {VP8E_SET_SCALEMODE, vp8e_set_scalemode}, - {VP8E_SET_CPUUSED, set_param}, - {VP8E_SET_NOISE_SENSITIVITY, set_param}, - {VP8E_SET_ENABLEAUTOALTREF, set_param}, - {VP8E_SET_SHARPNESS, set_param}, - {VP8E_SET_STATIC_THRESHOLD, set_param}, - {VP8E_SET_TOKEN_PARTITIONS, set_param}, - {VP8E_GET_LAST_QUANTIZER, get_param}, - {VP8E_GET_LAST_QUANTIZER_64, get_param}, - {VP8E_SET_ARNR_MAXFRAMES, set_param}, - {VP8E_SET_ARNR_STRENGTH , set_param}, - {VP8E_SET_ARNR_TYPE , set_param}, - {VP8E_SET_TUNING, 
set_param}, - {VP8E_SET_CQ_LEVEL, set_param}, - {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, + {VP8E_SET_CPUUSED, set_cpu_used}, + {VP8E_SET_NOISE_SENSITIVITY, set_noise_sensitivity}, + {VP8E_SET_ENABLEAUTOALTREF, set_enable_auto_alt_ref}, + {VP8E_SET_SHARPNESS, set_sharpness}, + {VP8E_SET_STATIC_THRESHOLD, set_static_thresh}, + {VP8E_SET_TOKEN_PARTITIONS, set_token_partitions}, + {VP8E_GET_LAST_QUANTIZER, get_quantizer}, + {VP8E_GET_LAST_QUANTIZER_64, get_quantizer64}, + {VP8E_SET_ARNR_MAXFRAMES, set_arnr_max_frames}, + {VP8E_SET_ARNR_STRENGTH , set_arnr_strength}, + {VP8E_SET_ARNR_TYPE , set_arnr_type}, + {VP8E_SET_TUNING, set_tuning}, + {VP8E_SET_CQ_LEVEL, set_cq_level}, + {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct}, { -1, NULL}, }; @@ -1229,6 +1245,9 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 320, /* g_width */ 240, /* g_height */ + VPX_BITS_8, /* g_bit_depth */ + 8, /* g_input_bit_depth */ + {1, 30}, /* g_timebase */ 0, /* g_error_resilient */ @@ -1239,12 +1258,15 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 0, /* rc_dropframe_thresh */ 0, /* rc_resize_allowed */ + 1, /* rc_scaled_width */ + 1, /* rc_scaled_height */ 60, /* rc_resize_down_threshold */ 30, /* rc_resize_up_threshold */ VPX_VBR, /* rc_end_usage */ #if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION) {0}, /* rc_twopass_stats_in */ + {0}, /* rc_firstpass_mb_stats_in */ #endif 256, /* rc_target_bandwidth */ 4, /* rc_min_quantizer */ @@ -1266,17 +1288,17 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 128, /* kf_max_dist */ #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) - 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ + {0}, + {0}, /* ss_target_bitrate */ 1, /* ts_number_layers */ {0}, /* ts_target_bitrate */ {0}, /* ts_rate_decimator */ 0, /* ts_periodicity */ {0}, /* ts_layer_id */ }}, - { -1, {NOT_IMPLEMENTED}} }; @@ -1293,20 +1315,19 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_init, /* vpx_codec_init_fn_t init; */ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ { - NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */ - NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */ - NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */ - NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, /* vpx_codec_peek_si_fn_t peek_si; */ + NULL, /* vpx_codec_get_si_fn_t get_si; */ + NULL, /* vpx_codec_decode_fn_t decode; */ + NULL, /* vpx_codec_frame_get_fn_t frame_get; */ }, { + 1, /* 1 cfg map */ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ vp8e_set_config, - NOT_IMPLEMENTED, + NULL, vp8e_get_preview, vp8e_mr_alloc_mem, } /* encoder functions */
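For context on the remapped table above: the library dispatches vpx_codec_control() calls by walking the interface's ctrl map and handing the va_list to the first matching entry, which with this change no longer receives a separate ctrl-ID argument. A simplified sketch of that dispatch, assuming the {ctrl_id, fn} layout of vpx_codec_ctrl_fn_map_t and omitting the real library's wrapper plumbing (dispatch_ctrl itself is illustrative, not a libvpx function):

    #include <stdarg.h>

    /* Walk a {ctrl_id, fn} table such as vp8e_ctf_maps; {-1, NULL}
     * terminates it. The matched handler (e.g. set_cq_level) pulls its
     * own typed argument out of the va_list via CAST(). */
    static vpx_codec_err_t dispatch_ctrl(vpx_codec_alg_priv_t *alg_priv,
                                         vpx_codec_ctrl_fn_map_t *entry,
                                         int ctrl_id, ...)
    {
        for (; entry->fn; entry++)
        {
            if (!entry->ctrl_id || entry->ctrl_id == ctrl_id)
            {
                va_list ap;
                vpx_codec_err_t res;

                va_start(ap, ctrl_id);
                res = entry->fn(alg_priv, ap);
                va_end(ap);
                return res;
            }
        }

        return VPX_CODEC_ERROR;
    }

diff --git a/media/libvpx/vp8/vp8_dx_iface.c b/media/libvpx/vp8/vp8_dx_iface.c index 871b8d385791..3ab8ed0ad69b 100644 --- a/media/libvpx/vp8/vp8_dx_iface.c +++ b/media/libvpx/vp8/vp8_dx_iface.c @@ -16,9 +16,10 @@ #include "vpx/vp8dx.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" +#include "common/alloccommon.h" +#include "common/common.h" #include "common/onyxd.h" #include "decoder/onyxd_int.h" -#include "common/alloccommon.h" #include "vpx_mem/vpx_mem.h" #if CONFIG_ERROR_CONCEALMENT #include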
"decoder/error_concealment.h" @@ -41,19 +42,11 @@ typedef enum static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_t); -static const mem_req_t vp8_mem_req_segs[] = -{ - {VP8_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, vp8_priv_sz}, - {VP8_SEG_MAX, 0, 0, 0, NULL} -}; - struct vpx_codec_alg_priv { vpx_codec_priv_t base; - vpx_codec_mmap_t mmaps[NELEMENTS(vp8_mem_req_segs)-1]; vpx_codec_dec_cfg_t cfg; vp8_stream_info_t si; - int defer_alloc; int decoder_init; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; @@ -64,9 +57,10 @@ struct vpx_codec_alg_priv int dbg_color_b_modes_flag; int dbg_display_mv_flag; #endif - vp8_decrypt_cb *decrypt_cb; + vpx_decrypt_cb decrypt_cb; void *decrypt_state; vpx_image_t img; + int flushed; int img_setup; struct frame_buffers yv12_frame_buffers; void *user_priv; @@ -84,41 +78,32 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_ return sizeof(vpx_codec_alg_priv_t); } -static void vp8_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) +static void vp8_init_ctx(vpx_codec_ctx_t *ctx) { - int i; + vpx_codec_alg_priv_t *priv = + (vpx_codec_alg_priv_t *)vpx_calloc(1, sizeof(*priv)); - ctx->priv = mmap->base; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->iface = ctx->iface; - ctx->priv->alg_priv = mmap->base; - - for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) - ctx->priv->alg_priv->mmaps[i].id = vp8_mem_req_segs[i].id; - - ctx->priv->alg_priv->mmaps[0] = *mmap; - ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); - ctx->priv->alg_priv->decrypt_cb = NULL; - ctx->priv->alg_priv->decrypt_state = NULL; + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; + priv->si.sz = sizeof(priv->si); + priv->decrypt_cb = NULL; + priv->decrypt_state = NULL; + priv->flushed = 0; + if (ctx->config.dec) { /* Update the reference to the config structure to an internal copy. */ - ctx->priv->alg_priv->cfg = *ctx->config.dec; - ctx->config.dec = &ctx->priv->alg_priv->cfg; + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; } } -static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) -{ - /* nothing to clean up */ -} - static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, vpx_codec_priv_enc_mr_cfg_t *data) { - vpx_codec_err_t res = VPX_CODEC_OK; + vpx_codec_err_t res = VPX_CODEC_OK; + vpx_codec_alg_priv_t *priv = NULL; (void) data; vp8_rtcd(); @@ -129,41 +114,31 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, */ if (!ctx->priv) { - vpx_codec_mmap_t mmap; - - mmap.id = vp8_mem_req_segs[0].id; - mmap.sz = sizeof(vpx_codec_alg_priv_t); - mmap.align = vp8_mem_req_segs[0].align; - mmap.flags = vp8_mem_req_segs[0].flags; - - res = vpx_mmap_alloc(&mmap); - if (res != VPX_CODEC_OK) return res; - - vp8_init_ctx(ctx, &mmap); + vp8_init_ctx(ctx); + priv = (vpx_codec_alg_priv_t *)ctx->priv; /* initialize number of fragments to zero */ - ctx->priv->alg_priv->fragments.count = 0; + priv->fragments.count = 0; /* is input fragments enabled? 
*/ - ctx->priv->alg_priv->fragments.enabled = - (ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_INPUT_FRAGMENTS); + priv->fragments.enabled = + (priv->base.init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS); - ctx->priv->alg_priv->defer_alloc = 1; /*post processing level initialized to do nothing */ } + else + { + priv = (vpx_codec_alg_priv_t *)ctx->priv; + } - ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = - (ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_FRAME_THREADING); + priv->yv12_frame_buffers.use_frame_threads = + (ctx->priv->init_flags & VPX_CODEC_USE_FRAME_THREADING); /* for now, disable frame threading */ - ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads = 0; + priv->yv12_frame_buffers.use_frame_threads = 0; - if(ctx->priv->alg_priv->yv12_frame_buffers.use_frame_threads && - (( ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_ERROR_CONCEALMENT) - || ( ctx->priv->alg_priv->base.init_flags & - VPX_CODEC_USE_INPUT_FRAGMENTS) ) ) + if (priv->yv12_frame_buffers.use_frame_threads && + ((ctx->priv->init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT) || + (ctx->priv->init_flags & VPX_CODEC_USE_INPUT_FRAGMENTS))) { /* row-based threading, error concealment, and input fragments will * not be supported when using frame-based threading */ @@ -175,15 +150,9 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { - int i; - vp8_remove_decoder_instances(&ctx->yv12_frame_buffers); - for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) - { - if (ctx->mmaps[i].dtor) - ctx->mmaps[i].dtor(&ctx->mmaps[i]); - } + vpx_free(ctx); return VPX_CODEC_OK; } @@ -191,7 +160,7 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si, - vp8_decrypt_cb *decrypt_cb, + vpx_decrypt_cb decrypt_cb, void *decrypt_state) { vpx_codec_err_t res = VPX_CODEC_OK; @@ -212,7 +181,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, const uint8_t *clear = data; if (decrypt_cb) { - int n = data_sz > 10 ? 
10 : data_sz; + int n = MIN(sizeof(clear_buffer), data_sz); decrypt_cb(decrypt_state, data, clear_buffer, n); clear = clear_buffer; } @@ -303,6 +272,7 @@ static void yuvconfig2image(vpx_image_t *img, img->stride[VPX_PLANE_U] = yv12->uv_stride; img->stride[VPX_PLANE_V] = yv12->uv_stride; img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; + img->bit_depth = 8; img->bps = 12; img->user_priv = user_priv; img->img_data = yv12->buffer_alloc; @@ -361,6 +331,13 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, unsigned int resolution_change = 0; unsigned int w, h; + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } + + /* Reset flushed when receiving a valid frame */ + ctx->flushed = 0; /* Update the input fragment data */ if(update_fragments(ctx, data, data_sz, &res) <= 0) @@ -389,72 +366,40 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; - /* Perform deferred allocations, if required */ - if (!res && ctx->defer_alloc) - { - int i; - - for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) - { - vpx_codec_dec_cfg_t cfg; - - cfg.w = ctx->si.w; - cfg.h = ctx->si.h; - ctx->mmaps[i].id = vp8_mem_req_segs[i].id; - ctx->mmaps[i].sz = vp8_mem_req_segs[i].sz; - ctx->mmaps[i].align = vp8_mem_req_segs[i].align; - ctx->mmaps[i].flags = vp8_mem_req_segs[i].flags; - - if (!ctx->mmaps[i].sz) - ctx->mmaps[i].sz = vp8_mem_req_segs[i].calc_sz(&cfg, - ctx->base.init_flags); - - res = vpx_mmap_alloc(&ctx->mmaps[i]); - } - - if (!res) - vp8_finalize_mmaps(ctx); - - ctx->defer_alloc = 0; - } - /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { - res = vpx_validate_mmaps(&ctx->si, ctx->mmaps, - vp8_mem_req_segs, NELEMENTS(vp8_mem_req_segs), - ctx->base.init_flags); + VP8D_CONFIG oxcf; - if (!res) - { - VP8D_CONFIG oxcf; + oxcf.Width = ctx->si.w; + oxcf.Height = ctx->si.h; + oxcf.Version = 9; + oxcf.postprocess = 0; + oxcf.max_threads = ctx->cfg.threads; + oxcf.error_concealment = + (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT); - oxcf.Width = ctx->si.w; - oxcf.Height = ctx->si.h; - oxcf.Version = 9; - oxcf.postprocess = 0; - oxcf.max_threads = ctx->cfg.threads; - oxcf.error_concealment = - (ctx->base.init_flags & VPX_CODEC_USE_ERROR_CONCEALMENT); + /* If postprocessing was enabled by the application and a + * configuration has not been provided, default it. + */ + if (!ctx->postproc_cfg_set + && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) { + ctx->postproc_cfg.post_proc_flag = + VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE; + ctx->postproc_cfg.deblocking_level = 4; + ctx->postproc_cfg.noise_level = 0; + } - /* If postprocessing was enabled by the application and a - * configuration has not been provided, default it. - */ - if (!ctx->postproc_cfg_set - && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) - { - ctx->postproc_cfg.post_proc_flag = - VP8_DEBLOCK | VP8_DEMACROBLOCK | VP8_MFQE; - ctx->postproc_cfg.deblocking_level = 4; - ctx->postproc_cfg.noise_level = 0; - } + res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); + ctx->decoder_init = 1; + } - res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); - ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb; - ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state; - } - - ctx->decoder_init = 1; + /* Set these even if already initialized. The caller may have changed the + * decrypt config between frames. 
+ */ + if (ctx->decoder_init) { + ctx->yv12_frame_buffers.pbi[0]->decrypt_cb = ctx->decrypt_cb; + ctx->yv12_frame_buffers.pbi[0]->decrypt_state = ctx->decrypt_state; } if (!res) @@ -475,6 +420,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; + vp8_clear_system_state(); /* same return value as used in vp8dx_receive_compressed_data */ return -1; } @@ -618,103 +564,26 @@ static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t *ctx, return img; } - -static -vpx_codec_err_t vp8_xma_get_mmap(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) -{ - vpx_codec_err_t res; - const mem_req_t *seg_iter = *iter; - - /* Get address of next segment request */ - do - { - if (!seg_iter) - seg_iter = vp8_mem_req_segs; - else if (seg_iter->id != VP8_SEG_MAX) - seg_iter++; - - *iter = (vpx_codec_iter_t)seg_iter; - - if (seg_iter->id != VP8_SEG_MAX) - { - mmap->id = seg_iter->id; - mmap->sz = seg_iter->sz; - mmap->align = seg_iter->align; - mmap->flags = seg_iter->flags; - - if (!seg_iter->sz) - mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags); - - res = VPX_CODEC_OK; - } - else - res = VPX_CODEC_LIST_END; - } - while (!mmap->sz && res != VPX_CODEC_LIST_END); - - return res; -} - -static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, - const vpx_codec_mmap_t *mmap) -{ - vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; - int i, done; - - if (!ctx->priv) - { - if (mmap->id == VP8_SEG_ALG_PRIV) - { - if (!ctx->priv) - { - vp8_init_ctx(ctx, mmap); - res = VPX_CODEC_OK; - } - } - } - - done = 1; - - if (!res && ctx->priv->alg_priv) - { - for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) - { - if (ctx->priv->alg_priv->mmaps[i].id == mmap->id) - if (!ctx->priv->alg_priv->mmaps[i].base) - { - ctx->priv->alg_priv->mmaps[i] = *mmap; - res = VPX_CODEC_OK; - } - - done &= (ctx->priv->alg_priv->mmaps[i].base != NULL); - } - } - - if (done && !res) - { - vp8_finalize_mmaps(ctx->priv->alg_priv); - res = ctx->iface->init(ctx, NULL); - } - - return res; -} - static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, YV12_BUFFER_CONFIG *yv12) { + const int y_w = img->d_w; + const int y_h = img->d_h; + const int uv_w = (img->d_w + 1) / 2; + const int uv_h = (img->d_h + 1) / 2; vpx_codec_err_t res = VPX_CODEC_OK; yv12->y_buffer = img->planes[VPX_PLANE_Y]; yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; - yv12->y_crop_width = img->d_w; - yv12->y_crop_height = img->d_h; - yv12->y_width = img->d_w; - yv12->y_height = img->d_h; - yv12->uv_width = yv12->y_width / 2; - yv12->uv_height = yv12->y_height / 2; + yv12->y_crop_width = y_w; + yv12->y_crop_height = y_h; + yv12->y_width = y_w; + yv12->y_height = y_h; + yv12->uv_crop_width = uv_w; + yv12->uv_crop_height = uv_h; + yv12->uv_width = uv_w; + yv12->uv_height = uv_h; yv12->y_stride = img->stride[VPX_PLANE_Y]; yv12->uv_stride = img->stride[VPX_PLANE_U]; @@ -725,8 +594,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -747,8 +615,7 @@ static vpx_codec_err_t vp8_set_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8_get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); @@ -769,7 +636,6 @@ static vpx_codec_err_t 
vp8_get_reference(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { #if CONFIG_POSTPROC @@ -789,31 +655,56 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx, #endif } -static vpx_codec_err_t vp8_set_dbg_options(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) -{ + +static vpx_codec_err_t vp8_set_dbg_color_ref_frame(vpx_codec_alg_priv_t *ctx, + va_list args) { #if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - int data = va_arg(args, int); - -#define MAP(id, var) case id: var = data; break; - - switch (ctrl_id) - { - MAP (VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag); - MAP (VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag); - MAP (VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag); - MAP (VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag); - } - - return VPX_CODEC_OK; + ctx->dbg_color_ref_frame_flag = va_arg(args, int); + return VPX_CODEC_OK; #else - return VPX_CODEC_INCAPABLE; + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_set_dbg_color_mb_modes(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_color_mb_modes_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_set_dbg_color_b_modes(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_color_b_modes_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +#endif +} + +static vpx_codec_err_t vp8_set_dbg_display_mv(vpx_codec_alg_priv_t *ctx, + va_list args) { +#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC + ctx->dbg_display_mv_flag = va_arg(args, int); + return VPX_CODEC_OK; +#else + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; #endif } static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { int *update_info = va_arg(args, int *); @@ -834,7 +725,6 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, extern int vp8dx_references_buffer( VP8_COMMON *oci, int ref_frame ); static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { int *ref_info = va_arg(args, int *); @@ -855,7 +745,6 @@ static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { @@ -864,8 +753,9 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, if (corrupted && pbi) { - *corrupted = pbi->common.frame_to_show->corrupted; - + const YV12_BUFFER_CONFIG *const frame = pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; return VPX_CODEC_OK; } else @@ -874,10 +764,9 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } static vpx_codec_err_t vp8_set_decryptor(vpx_codec_alg_priv_t *ctx, - int ctrl_id, va_list args) { - vp8_decrypt_init *init = va_arg(args, vp8_decrypt_init *); + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); if (init) { @@ -897,14 +786,14 @@ vpx_codec_ctrl_fn_map_t vp8_ctf_maps[] = {VP8_SET_REFERENCE, vp8_set_reference}, {VP8_COPY_REFERENCE, vp8_get_reference}, {VP8_SET_POSTPROC, vp8_set_postproc}, - {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_options}, - 
{VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_options}, - {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_options}, - {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_options}, + {VP8_SET_DBG_COLOR_REF_FRAME, vp8_set_dbg_color_ref_frame}, + {VP8_SET_DBG_COLOR_MB_MODES, vp8_set_dbg_color_mb_modes}, + {VP8_SET_DBG_COLOR_B_MODES, vp8_set_dbg_color_b_modes}, + {VP8_SET_DBG_DISPLAY_MV, vp8_set_dbg_display_mv}, {VP8D_GET_LAST_REF_UPDATES, vp8_get_last_ref_updates}, {VP8D_GET_FRAME_CORRUPTED, vp8_get_frame_corrupted}, {VP8D_GET_LAST_REF_USED, vp8_get_last_ref_frame}, - {VP8D_SET_DECRYPTOR, vp8_set_decryptor}, + {VPXD_SET_DECRYPTOR, vp8_set_decryptor}, { -1, NULL}, }; @@ -922,20 +811,20 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = vp8_init, /* vpx_codec_init_fn_t init; */ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - vp8_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */ - vp8_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */ { vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp8_decode, /* vpx_codec_decode_fn_t decode; */ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, }, { /* encoder functions */ - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED + 0, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL } }; diff --git a/media/libvpx/vp8_rtcd_armv7-android-gcc.h b/media/libvpx/vp8_rtcd_armv7-android-gcc.h index 25dc5945d59f..3c1681c6a14a 100644 --- a/media/libvpx/vp8_rtcd_armv7-android-gcc.h +++ b/media/libvpx/vp8_rtcd_armv7-android-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP8 */ @@ -22,33 +26,118 @@ struct variance_vtable; union int_mv; struct yv12_buffer_config; +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void 
vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +void vp8_build_intra_predictors_mbuv_s_neon(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); + +void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +void vp8_build_intra_predictors_mby_s_neon(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); + void vp8_clear_system_state_c(); #define vp8_clear_system_state vp8_clear_system_state_c -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_v6(struct blockd*, short *dqc); -void vp8_dequantize_b_neon(struct blockd*, short *dqc); -RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_v6(short input, unsigned char *pred, int pred_stride, unsigned char *dst, 
int dst_stride); +void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *output, int stride); void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride); RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); - void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void 
vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_v6(struct blockd*, short *dqc); +void vp8_dequantize_b_neon(struct blockd*, short *dqc); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_armv6(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); +void vp8_fast_quantize_b_pair_neon(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); +RTCD_EXTERN void (*vp8_fast_quantize_b_pair)(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); + +int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_full_search_sad vp8_full_search_sad_c + +unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); + +unsigned int vp8_get_mb_ss_c(const short *); +#define vp8_get_mb_ss vp8_get_mb_ss_c + +void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); +void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); +#define vp8_intra4x4_predict vp8_intra4x4_predict_armv6 + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); void vp8_loop_filter_bv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int 
uv_stride, struct loop_filter_info *lfi); @@ -60,249 +149,128 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y, unsigned char *u, unsigned char void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); - -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); - -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); - -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_armv6(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_armv6(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); void vp8_loop_filter_bhs_armv6(unsigned char *y, int ystride, const unsigned char *blimit); void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int 
dst_stride); -RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_armv6(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); -#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_v6(short *input, short *output); -void vp8_short_inv_walsh4x4_neon(short *input, short *output); -RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_armv6(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_v6(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); - -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); - -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_v6(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); - -void 
vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c - -void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c - -void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); -void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); -RTCD_EXTERN void (*vp8_intra4x4_predict)(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); - -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN 
void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_c - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, 
unsigned int *sse);
-unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_armv6(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c

 unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vp8_mse16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);

-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+void vp8_quantize_mb_c(struct macroblock *);
+void vp8_quantize_mb_neon(struct macroblock *);
+RTCD_EXTERN void (*vp8_quantize_mb)(struct macroblock *);
+
+void vp8_quantize_mbuv_c(struct macroblock *);
+void vp8_quantize_mbuv_neon(struct macroblock *);
+RTCD_EXTERN void (*vp8_quantize_mbuv)(struct macroblock *);
+
+void vp8_quantize_mby_c(struct macroblock *);
+void vp8_quantize_mby_neon(struct macroblock *);
+RTCD_EXTERN void (*vp8_quantize_mby)(struct macroblock *);
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x16_armv6(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp8_sad16x16x3 vp8_sad16x16x3_c
+
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp8_sad16x16x4d vp8_sad16x16x4d_c
+
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+#define vp8_sad16x16x8 vp8_sad16x16x8_c
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp8_sad16x8x3 vp8_sad16x8x3_c
+
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp8_sad16x8x4d vp8_sad16x8x4d_c
+
+void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+#define vp8_sad16x8x8 vp8_sad16x8x8_c
+
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp8_sad4x4x3 vp8_sad4x4x3_c
+
+void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp8_sad4x4x4d vp8_sad4x4x4d_c
+
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+#define vp8_sad4x4x8 vp8_sad4x4x8_c
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp8_sad8x16x3 vp8_sad8x16x3_c
+
+void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp8_sad8x16x4d vp8_sad8x16x4d_c
+
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+#define vp8_sad8x16x8 vp8_sad8x16x8_c
+
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp8_sad8x8x3 vp8_sad8x8x3_c
+
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp8_sad8x8x4d vp8_sad8x8x4d_c
+
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+#define vp8_sad8x8x8 vp8_sad8x8x8_c

 void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch);
@@ -314,79 +282,118 @@ void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);

+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_v6(short *input, short *output);
+void vp8_short_inv_walsh4x4_neon(short *input, short *output);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);

-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);

-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_armv6(struct block *, struct blockd *);
-void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_armv6

-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);

-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-void vp8_fast_quantize_b_pair_neon(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-RTCD_EXTERN void (*vp8_fast_quantize_b_pair)(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_armv6(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);

-void vp8_quantize_mb_c(struct macroblock *);
-void vp8_quantize_mb_neon(struct macroblock *);
-RTCD_EXTERN void (*vp8_quantize_mb)(struct macroblock *);
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c

-void vp8_quantize_mby_c(struct macroblock *);
-void vp8_quantize_mby_neon(struct macroblock *);
-RTCD_EXTERN void (*vp8_quantize_mby)(struct macroblock *);
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);

-void vp8_quantize_mbuv_c(struct macroblock *);
-void vp8_quantize_mbuv_neon(struct macroblock *);
-RTCD_EXTERN void (*vp8_quantize_mbuv)(struct macroblock *);
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c

-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
+unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c

-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_c
+unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c

-int vp8_mbuverror_c(struct macroblock *mb);
-#define vp8_mbuverror vp8_mbuverror_c
+unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_neon(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);

 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_armv6(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_neon(struct block *be, struct blockd *bd, int pitch);
 RTCD_EXTERN void (*vp8_subtract_b)(struct block *be, struct blockd *bd, int pitch);

-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_armv6(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-
 void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_armv6(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 RTCD_EXTERN void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);

-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_armv6(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);

-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_refining_search_sad vp8_refining_search_sad_c
+unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);

-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);

-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-void vp8_yv12_copy_partial_frame_neon(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-RTCD_EXTERN void (*vp8_yv12_copy_partial_frame)(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_variance4x4 vp8_variance4x4_c

-int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-#define vp8_denoiser_filter vp8_denoiser_filter_c
+unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_armv6(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);

 void vp8_rtcd(void);
+
 #include "vpx_config.h"

 #ifdef RTCD_C
@@ -397,245 +404,123 @@ static void setup_rtcd_internal(void)
     (void)flags;
-
-
-    vp8_dequantize_b = vp8_dequantize_b_c;
-    if (flags & HAS_MEDIA) vp8_dequantize_b = vp8_dequantize_b_v6;
-    if (flags & HAS_NEON) vp8_dequantize_b = vp8_dequantize_b_neon;
-
-    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
-    if (flags & HAS_MEDIA) vp8_dequant_idct_add = vp8_dequant_idct_add_v6;
-    if (flags & HAS_NEON) vp8_dequant_idct_add = vp8_dequant_idct_add_neon;
-
-    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
-    if (flags & HAS_MEDIA) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
-    if (flags & HAS_NEON) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
-
-    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
-    if (flags & HAS_MEDIA) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
-    if (flags & HAS_NEON) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
-
-    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_mbv = vp8_loop_filter_mbv_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_mbv = vp8_loop_filter_mbv_neon;
-
-    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_bv = vp8_loop_filter_bv_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_bv = vp8_loop_filter_bv_neon;
-
-    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_mbh = vp8_loop_filter_mbh_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_mbh = vp8_loop_filter_mbh_neon;
-
-    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_bh = vp8_loop_filter_bh_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_bh = vp8_loop_filter_bh_neon;
-
-    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_simple_mbv = vp8_loop_filter_mbvs_neon;
-
-    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_simple_mbh = vp8_loop_filter_mbhs_neon;
-
-    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_neon;
-
-    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
-    if (flags & HAS_MEDIA) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_armv6;
-    if (flags & HAS_NEON) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_neon;
-
-    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
-    if (flags & HAS_MEDIA) vp8_short_idct4x4llm = vp8_short_idct4x4llm_v6_dual;
-    if (flags & HAS_NEON) vp8_short_idct4x4llm = vp8_short_idct4x4llm_neon;
-
-
-    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
-    if (flags & HAS_MEDIA) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_v6;
-    if (flags & HAS_NEON) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_neon;
-
-    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
-    if (flags & HAS_MEDIA) vp8_dc_only_idct_add = vp8_dc_only_idct_add_v6;
-    if (flags & HAS_NEON) vp8_dc_only_idct_add = vp8_dc_only_idct_add_neon;
-
-    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
-    if (flags & HAS_MEDIA) vp8_copy_mem16x16 = vp8_copy_mem16x16_v6;
-    if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon;
-
-    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
-    if (flags & HAS_MEDIA) vp8_copy_mem8x8 = vp8_copy_mem8x8_v6;
-    if (flags & HAS_NEON) vp8_copy_mem8x8 = vp8_copy_mem8x8_neon;
-
-    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-    if (flags & HAS_MEDIA) vp8_copy_mem8x4 = vp8_copy_mem8x4_v6;
-    if (flags & HAS_NEON) vp8_copy_mem8x4 = vp8_copy_mem8x4_neon;
-
-
-    vp8_intra4x4_predict = vp8_intra4x4_predict_c;
-    if (flags & HAS_MEDIA) vp8_intra4x4_predict = vp8_intra4x4_predict_armv6;
-
-    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
-    if (flags & HAS_MEDIA) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_armv6;
-    if (flags & HAS_NEON) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_neon;
-
-    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
-    if (flags & HAS_MEDIA) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_armv6;
-    if (flags & HAS_NEON) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_neon;
-
-    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
-    if (flags & HAS_MEDIA) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_armv6;
-    if (flags & HAS_NEON) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_neon;
-
-    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
-    if (flags & HAS_MEDIA) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_armv6;
-    if (flags & HAS_NEON) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_neon;
-
-    vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c;
-    if (flags & HAS_MEDIA) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_armv6;
+    vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_armv6;
     if (flags & HAS_NEON) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_neon;
-
-    vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
-    if (flags & HAS_MEDIA) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_armv6;
-    if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon;
-
-    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MEDIA) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_armv6;
-    if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon;
-
-    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MEDIA) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_armv6;
+    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_armv6;
     if (flags & HAS_NEON) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_neon;
-
-
-    vp8_variance8x8 = vp8_variance8x8_c;
-    if (flags & HAS_MEDIA) vp8_variance8x8 = vp8_variance8x8_armv6;
-    if (flags & HAS_NEON) vp8_variance8x8 = vp8_variance8x8_neon;
-
-    vp8_variance8x16 = vp8_variance8x16_c;
-    if (flags & HAS_NEON) vp8_variance8x16 = vp8_variance8x16_neon;
-
-    vp8_variance16x8 = vp8_variance16x8_c;
-    if (flags & HAS_NEON) vp8_variance16x8 = vp8_variance16x8_neon;
-
-    vp8_variance16x16 = vp8_variance16x16_c;
-    if (flags & HAS_MEDIA) vp8_variance16x16 = vp8_variance16x16_armv6;
-    if (flags & HAS_NEON) vp8_variance16x16 = vp8_variance16x16_neon;
-
-
-    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
-    if (flags & HAS_MEDIA) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_armv6;
-    if (flags & HAS_NEON) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_neon;
-
-
-
-    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
-    if (flags & HAS_MEDIA) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_armv6;
-    if (flags & HAS_NEON) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_neon;
-
-    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
-    if (flags & HAS_MEDIA) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_armv6;
-    if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_neon;
-
-    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
-    if (flags & HAS_MEDIA) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_armv6;
-    if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_neon;
-
-    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
-    if (flags & HAS_MEDIA) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_armv6;
-    if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_neon;
-
-    vp8_sad4x4 = vp8_sad4x4_c;
-    if (flags & HAS_NEON) vp8_sad4x4 = vp8_sad4x4_neon;
-
-    vp8_sad8x8 = vp8_sad8x8_c;
-    if (flags & HAS_NEON) vp8_sad8x8 = vp8_sad8x8_neon;
-
-    vp8_sad8x16 = vp8_sad8x16_c;
-    if (flags & HAS_NEON) vp8_sad8x16 = vp8_sad8x16_neon;
-
-    vp8_sad16x8 = vp8_sad16x8_c;
-    if (flags & HAS_NEON) vp8_sad16x8 = vp8_sad16x8_neon;
-
-    vp8_sad16x16 = vp8_sad16x16_c;
-    if (flags & HAS_MEDIA) vp8_sad16x16 = vp8_sad16x16_armv6;
-    if (flags & HAS_NEON) vp8_sad16x16 = vp8_sad16x16_neon;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp8_mse16x16 = vp8_mse16x16_c;
-    if (flags & HAS_MEDIA) vp8_mse16x16 = vp8_mse16x16_armv6;
-    if (flags & HAS_NEON) vp8_mse16x16 = vp8_mse16x16_neon;
-
-    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-    if (flags & HAS_NEON) vp8_get4x4sse_cs = vp8_get4x4sse_cs_neon;
-
-    vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
-    if (flags & HAS_MEDIA) vp8_short_fdct4x4 = vp8_short_fdct4x4_armv6;
-    if (flags & HAS_NEON) vp8_short_fdct4x4 = vp8_short_fdct4x4_neon;
-
-    vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
-    if (flags & HAS_MEDIA) vp8_short_fdct8x4 = vp8_short_fdct8x4_armv6;
-    if (flags & HAS_NEON) vp8_short_fdct8x4 = vp8_short_fdct8x4_neon;
-
-    vp8_short_walsh4x4 = vp8_short_walsh4x4_c;
-    if (flags & HAS_MEDIA) vp8_short_walsh4x4 = vp8_short_walsh4x4_armv6;
-    if (flags & HAS_NEON) vp8_short_walsh4x4 = vp8_short_walsh4x4_neon;
-
-
-    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-    if (flags & HAS_MEDIA) vp8_fast_quantize_b = vp8_fast_quantize_b_armv6;
+    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_armv6;
+    if (flags & HAS_NEON) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_neon;
+    vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_armv6;
+    if (flags & HAS_NEON) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_neon;
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
+    if (flags & HAS_NEON) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_neon;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
+    if (flags & HAS_NEON) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_neon;
+    vp8_copy_mem16x16 = vp8_copy_mem16x16_v6;
+    if (flags & HAS_NEON) vp8_copy_mem16x16 = vp8_copy_mem16x16_neon;
+    vp8_copy_mem8x4 = vp8_copy_mem8x4_v6;
+    if (flags & HAS_NEON) vp8_copy_mem8x4 = vp8_copy_mem8x4_neon;
+    vp8_copy_mem8x8 = vp8_copy_mem8x8_v6;
+    if (flags & HAS_NEON) vp8_copy_mem8x8 = vp8_copy_mem8x8_neon;
+    vp8_dc_only_idct_add = vp8_dc_only_idct_add_v6;
+    if (flags & HAS_NEON) vp8_dc_only_idct_add = vp8_dc_only_idct_add_neon;
+    vp8_denoiser_filter = vp8_denoiser_filter_c;
+    if (flags & HAS_NEON) vp8_denoiser_filter = vp8_denoiser_filter_neon;
+    vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c;
+    if (flags & HAS_NEON) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_neon;
+    vp8_dequant_idct_add = vp8_dequant_idct_add_v6;
+    if (flags & HAS_NEON) vp8_dequant_idct_add = vp8_dequant_idct_add_neon;
+    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_v6;
+    if (flags & HAS_NEON) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_neon;
+    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_v6;
+    if (flags & HAS_NEON) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_neon;
+    vp8_dequantize_b = vp8_dequantize_b_v6;
+    if (flags & HAS_NEON) vp8_dequantize_b = vp8_dequantize_b_neon;
+    vp8_fast_quantize_b = vp8_fast_quantize_b_armv6;
     if (flags & HAS_NEON) vp8_fast_quantize_b = vp8_fast_quantize_b_neon;
-
-    vp8_fast_quantize_b_pair = vp8_fast_quantize_b_pair_c;
     if (flags & HAS_NEON) vp8_fast_quantize_b_pair = vp8_fast_quantize_b_pair_neon;
-
+    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+    if (flags & HAS_NEON) vp8_get4x4sse_cs = vp8_get4x4sse_cs_neon;
+    vp8_loop_filter_bh = vp8_loop_filter_bh_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_bh = vp8_loop_filter_bh_neon;
+    vp8_loop_filter_bv = vp8_loop_filter_bv_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_bv = vp8_loop_filter_bv_neon;
+    vp8_loop_filter_mbh = vp8_loop_filter_mbh_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_mbh = vp8_loop_filter_mbh_neon;
+    vp8_loop_filter_mbv = vp8_loop_filter_mbv_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_mbv = vp8_loop_filter_mbv_neon;
+    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_neon;
+    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_neon;
+    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_simple_mbh = vp8_loop_filter_mbhs_neon;
+    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_armv6;
+    if (flags & HAS_NEON) vp8_loop_filter_simple_mbv = vp8_loop_filter_mbvs_neon;
+    vp8_mse16x16 = vp8_mse16x16_armv6;
+    if (flags & HAS_NEON) vp8_mse16x16 = vp8_mse16x16_neon;
     vp8_quantize_mb = vp8_quantize_mb_c;
     if (flags & HAS_NEON) vp8_quantize_mb = vp8_quantize_mb_neon;
-
-    vp8_quantize_mby = vp8_quantize_mby_c;
-    if (flags & HAS_NEON) vp8_quantize_mby = vp8_quantize_mby_neon;
-
     vp8_quantize_mbuv = vp8_quantize_mbuv_c;
     if (flags & HAS_NEON) vp8_quantize_mbuv = vp8_quantize_mbuv_neon;
-
-
-
-
-    vp8_subtract_b = vp8_subtract_b_c;
-    if (flags & HAS_MEDIA) vp8_subtract_b = vp8_subtract_b_armv6;
+    vp8_quantize_mby = vp8_quantize_mby_c;
+    if (flags & HAS_NEON) vp8_quantize_mby = vp8_quantize_mby_neon;
+    vp8_sad16x16 = vp8_sad16x16_armv6;
+    if (flags & HAS_NEON) vp8_sad16x16 = vp8_sad16x16_neon;
+    vp8_sad16x8 = vp8_sad16x8_c;
+    if (flags & HAS_NEON) vp8_sad16x8 = vp8_sad16x8_neon;
+    vp8_sad4x4 = vp8_sad4x4_c;
+    if (flags & HAS_NEON) vp8_sad4x4 = vp8_sad4x4_neon;
+    vp8_sad8x16 = vp8_sad8x16_c;
+    if (flags & HAS_NEON) vp8_sad8x16 = vp8_sad8x16_neon;
+    vp8_sad8x8 = vp8_sad8x8_c;
+    if (flags & HAS_NEON) vp8_sad8x8 = vp8_sad8x8_neon;
+    vp8_short_fdct4x4 = vp8_short_fdct4x4_armv6;
+    if (flags & HAS_NEON) vp8_short_fdct4x4 = vp8_short_fdct4x4_neon;
+    vp8_short_fdct8x4 = vp8_short_fdct8x4_armv6;
+    if (flags & HAS_NEON) vp8_short_fdct8x4 = vp8_short_fdct8x4_neon;
+    vp8_short_idct4x4llm = vp8_short_idct4x4llm_v6_dual;
+    if (flags & HAS_NEON) vp8_short_idct4x4llm = vp8_short_idct4x4llm_neon;
+    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_v6;
+    if (flags & HAS_NEON) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_neon;
+    vp8_short_walsh4x4 = vp8_short_walsh4x4_armv6;
+    if (flags & HAS_NEON) vp8_short_walsh4x4 = vp8_short_walsh4x4_neon;
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_armv6;
+    if (flags & HAS_NEON) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_neon;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_armv6;
+    if (flags & HAS_NEON) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_neon;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_armv6;
+    if (flags & HAS_NEON) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_neon;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_armv6;
+    if (flags & HAS_NEON) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_neon;
+    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_armv6;
+    if (flags & HAS_NEON) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_neon;
+    vp8_subtract_b = vp8_subtract_b_armv6;
     if (flags & HAS_NEON) vp8_subtract_b = vp8_subtract_b_neon;
-
-    vp8_subtract_mby = vp8_subtract_mby_c;
-    if (flags & HAS_MEDIA) vp8_subtract_mby = vp8_subtract_mby_armv6;
-    if (flags & HAS_NEON) vp8_subtract_mby = vp8_subtract_mby_neon;
-
-    vp8_subtract_mbuv = vp8_subtract_mbuv_c;
-    if (flags & HAS_MEDIA) vp8_subtract_mbuv = vp8_subtract_mbuv_armv6;
+    vp8_subtract_mbuv = vp8_subtract_mbuv_armv6;
     if (flags & HAS_NEON) vp8_subtract_mbuv = vp8_subtract_mbuv_neon;
-
-
-
-
-    vp8_yv12_copy_partial_frame = vp8_yv12_copy_partial_frame_c;
-    if (flags & HAS_NEON) vp8_yv12_copy_partial_frame = vp8_yv12_copy_partial_frame_neon;
+    vp8_subtract_mby = vp8_subtract_mby_armv6;
+    if (flags & HAS_NEON) vp8_subtract_mby = vp8_subtract_mby_neon;
+    vp8_variance16x16 = vp8_variance16x16_armv6;
+    if (flags & HAS_NEON) vp8_variance16x16 = vp8_variance16x16_neon;
+    vp8_variance16x8 = vp8_variance16x8_c;
+    if (flags & HAS_NEON) vp8_variance16x8 = vp8_variance16x8_neon;
+    vp8_variance8x16 = vp8_variance8x16_c;
+    if (flags & HAS_NEON) vp8_variance8x16 = vp8_variance8x16_neon;
+    vp8_variance8x8 = vp8_variance8x8_armv6;
+    if (flags & HAS_NEON) vp8_variance8x8 = vp8_variance8x8_neon;
+    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_armv6;
+    if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_neon;
+    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_armv6;
+    if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_neon;
+    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_armv6;
+    if (flags & HAS_NEON) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_neon;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
diff --git a/media/libvpx/vp8_rtcd_generic-gnu.h b/media/libvpx/vp8_rtcd_generic-gnu.h
index a198a439f015..58282fc06f85 100644
--- a/media/libvpx/vp8_rtcd_generic-gnu.h
+++ b/media/libvpx/vp8_rtcd_generic-gnu.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,23 +26,83 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;

+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
+
 void vp8_clear_system_state_c();
 #define vp8_clear_system_state vp8_clear_system_state_c

-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_c
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_c
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_c

 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_c

-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
-
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_c

-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_c
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_c
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_c
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c
+
+unsigned int vp8_get_mb_ss_c(const short *);
+#define vp8_get_mb_ss vp8_get_mb_ss_c
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_c

 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bv vp8_loop_filter_bv_c
@@ -46,218 +110,20 @@ void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v,
 void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_c

-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_c
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_c

 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_c

-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_c

-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_c

-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c
-
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_c
-
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_c
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_c
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_c
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-#define vp8_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_c
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-#define vp8_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_c
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
-
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_c
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_c
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_c
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_c
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_c
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_c
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_c
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_c
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_c
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_c
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_c
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_c
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_c
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_c
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x16 vp8_sad16x16_c
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x3 vp8_sad4x4x3_c
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x3 vp8_sad8x8x3_c
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x3 vp8_sad8x16x3_c
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x3 vp8_sad16x8x3_c
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x3 vp8_sad16x16x3_c
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad4x4x8 vp8_sad4x4x8_c
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad8x8x8 vp8_sad8x8x8_c
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad8x16x8 vp8_sad8x16x8_c
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad16x8x8 vp8_sad16x8x8_c
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-#define vp8_sad16x16x8 vp8_sad16x16x8_c
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad4x4x4d vp8_sad4x4x4d_c
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x8x4d vp8_sad8x8x4d_c
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad8x16x4d vp8_sad8x16x4d_c
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x8x4d vp8_sad16x8x4d_c
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp8_sad16x16x4d vp8_sad16x16x4d_c
-
-unsigned int vp8_get_mb_ss_c(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_c
-
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c
-
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_c
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_c
-
-void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct4x4 vp8_short_fdct4x4_c
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
-#define vp8_short_fdct8x4 vp8_short_fdct8x4_c
-
-void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-#define vp8_short_walsh4x4 vp8_short_walsh4x4_c
-
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_c
-
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-#define vp8_fast_quantize_b vp8_fast_quantize_b_c
-
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
-
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
-
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
-
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
-
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_c
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_c

 int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 #define vp8_mbblock_error vp8_mbblock_error_c
@@ -265,40 +131,183 @@ int vp8_mbblock_error_c(struct macroblock *mb, int dc);
 int vp8_mbuverror_c(struct macroblock *mb);
 #define vp8_mbuverror vp8_mbuverror_c

-void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
-#define vp8_subtract_b vp8_subtract_b_c
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_mse16x16 vp8_mse16x16_c

-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-#define vp8_subtract_mby vp8_subtract_mby_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c

-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-#define vp8_subtract_mbuv vp8_subtract_mbuv_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c

-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c

 int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 #define vp8_refining_search_sad vp8_refining_search_sad_c

-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_diamond_search_sad
vp8_diamond_search_sad_c +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); +#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c + +unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad16x16 vp8_sad16x16_c + +void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp8_sad16x16x3 vp8_sad16x16x3_c + +void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp8_sad16x16x4d vp8_sad16x16x4d_c + +void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +#define vp8_sad16x16x8 vp8_sad16x16x8_c + +unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad16x8 vp8_sad16x8_c + +void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp8_sad16x8x3 vp8_sad16x8x3_c + +void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp8_sad16x8x4d vp8_sad16x8x4d_c + +void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +#define vp8_sad16x8x8 vp8_sad16x8x8_c + +unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad4x4 vp8_sad4x4_c + +void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp8_sad4x4x3 vp8_sad4x4x3_c + +void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp8_sad4x4x4d vp8_sad4x4x4d_c + +void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +#define vp8_sad4x4x8 vp8_sad4x4x8_c + +unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad8x16 vp8_sad8x16_c + +void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp8_sad8x16x3 vp8_sad8x16x3_c + +void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp8_sad8x16x4d vp8_sad8x16x4d_c + +void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +#define vp8_sad8x16x8 vp8_sad8x16x8_c + +unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad8x8 vp8_sad8x8_c + +void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int 
ref_stride, unsigned int *sad_array); +#define vp8_sad8x8x3 vp8_sad8x8x3_c + +void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp8_sad8x8x4d vp8_sad8x8x4d_c + +void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +#define vp8_sad8x8x8 vp8_sad8x8x8_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_c + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_c + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_c + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_c + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_c + +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_c + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_c + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_c + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_c + +unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_c + +unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance16x16 vp8_sub_pixel_variance16x16_c + +unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance16x8 vp8_sub_pixel_variance16x8_c + +unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_c + +unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_c + +unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_c + +void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); +#define vp8_subtract_b vp8_subtract_b_c + +void 
vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); +#define vp8_subtract_mbuv vp8_subtract_mbuv_c + +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +#define vp8_subtract_mby vp8_subtract_mby_c void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c +unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x16 vp8_variance16x16_c -int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -#define vp8_denoiser_filter vp8_denoiser_filter_c +unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x8 vp8_variance16x8_c + +unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance4x4 vp8_variance4x4_c + +unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x16 vp8_variance8x16_c + +unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x8 vp8_variance8x8_c + +unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c + +unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c + +unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c void vp8_rtcd(void); + #include "vpx_config.h" #ifdef RTCD_C static void setup_rtcd_internal(void) { - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h b/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h index a13ca2b5a73b..a4f8d95dd3cf 100644 --- a/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h +++ b/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP8 */ @@ -22,32 +26,146 @@ struct variance_vtable; union int_mv; struct yv12_buffer_config; +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, 
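For context on the pattern every one of these generated headers follows: when only the C kernel exists, the generic name is #define-aliased to the *_c function; when SIMD variants exist, the header declares each variant plus an RTCD_EXTERN function pointer that setup_rtcd_internal() binds to the best variant for the running CPU (the real x86 headers derive their capability flags from x86_simd_caps()). A minimal, self-contained sketch of that dispatch scheme, with toy function names rather than the actual generated code:

    #include <stdio.h>

    /* Toy kernels standing in for the _c and _sse2 variants a real
     * rtcd header declares. */
    static int sum_c(int a, int b)    { return a + b; }
    static int sum_sse2(int a, int b) { return a + b; /* imagine SIMD here */ }

    /* Mirrors an RTCD_EXTERN function pointer. */
    static int (*sum)(int a, int b);

    /* Hypothetical capability probe; the generated x86 headers test
     * flags such as HAS_SSE2 instead. */
    static int cpu_has_sse2(void) { return 1; }

    static void setup_rtcd_internal(void)
    {
        sum = sum_c;          /* always-safe default */
        if (cpu_has_sse2())
            sum = sum_sse2;   /* upgrade when the CPU supports it */
    }

    int main(void)
    {
        setup_rtcd_internal();
        printf("%d\n", sum(2, 3));
        return 0;
    }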
diff --git a/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h b/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h
index a13ca2b5a73b..a4f8d95dd3cf 100644
--- a/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h
+++ b/media/libvpx/vp8_rtcd_x86-darwin9-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
 
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 RTCD_EXTERN void (*vp8_clear_system_state)();
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
 RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
 
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+
+unsigned int vp8_get_mb_ss_c(const short *);
+unsigned int vp8_get_mb_ss_mmx(const short *);
+unsigned int vp8_get_mb_ss_sse2(const short *);
+RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *);
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
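The other change this header picks up, besides the alphabetical reordering of declarations, is the C++ linkage guard added by the +#ifdef __cplusplus hunks above. The pattern, shown here as a generic header sketch rather than a copy of the generated file:

    /* example.h - illustrative only, not part of the patch */
    #ifndef EXAMPLE_H_
    #define EXAMPLE_H_

    #ifdef __cplusplus
    extern "C" {
    #endif

    /* Declarations keep plain C linkage even when this header is
     * included from C++ code, so callers link against the C symbols
     * without name mangling. */
    void example_function(int x);

    #ifdef __cplusplus
    }  /* extern "C" */
    #endif

    #endif  /* EXAMPLE_H_ */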
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 RTCD_EXTERN void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
 
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 RTCD_EXTERN void (*vp8_plane_add_noise)(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
 
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
 
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
 
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
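Most of the declarations this hunk reorders are sum-of-absolute-differences kernels: the plain forms score one candidate block, the x3/x8 forms score several neighbouring candidate positions in one call, and the x4d form scores four independent reference blocks. As a reminder of what the *_c reference versions compute, here is an illustrative 4x4 SAD with the same shape as vp8_sad4x4_c; this is a sketch, not the libvpx source:

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative 4x4 sum of absolute differences (not the actual
     * libvpx implementation). */
    static unsigned int sad4x4(const unsigned char *src_ptr, int src_stride,
                               const unsigned char *ref_ptr, int ref_stride,
                               unsigned int max_sad)
    {
        unsigned int sad = 0;
        int r, c;

        (void)max_sad;  /* optimized variants may use this bound to exit early */

        for (r = 0; r < 4; ++r) {
            for (c = 0; c < 4; ++c)
                sad += abs(src_ptr[c] - ref_ptr[c]);
            src_ptr += src_stride;
            ref_ptr += ref_stride;
        }
        return sad;
    }

    int main(void)
    {
        const unsigned char src[16] = { 10, 10, 10, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 10, 10 };
        const unsigned char ref[16] = { 12, 10,  9, 10, 10, 10, 10, 10,
                                        10, 10, 10, 10, 10, 10, 13, 10 };
        printf("SAD = %u\n", sad4x4(src, 4, ref, 4, ~0u));  /* prints SAD = 6 */
        return 0;
    }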
*d2); +#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); @@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short 
*sad_array); + +unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); - -void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); - -void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int 
*sad_array); void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); + +void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); + +unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned 
int max_sad); +unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); + +void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *); +void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x3_sse3(const 
+RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
@@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-int vp8_block_error_mmx(short *coeff, short *dqcoeff);
-int vp8_block_error_xmm(short *coeff, short *dqcoeff);
-RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
-int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
-RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
+unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-int vp8_mbuverror_c(struct macroblock *mb);
-int vp8_mbuverror_mmx(struct macroblock *mb);
-int vp8_mbuverror_xmm(struct macroblock *mb);
-RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
+unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch);
 RTCD_EXTERN void (*vp8_subtract_b)(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-
 void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 RTCD_EXTERN void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
+unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-RTCD_EXTERN int (*vp8_denoiser_filter)(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
+unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 void vp8_rtcd(void);
@@ -488,348 +494,256 @@ static void setup_rtcd_internal(void)
     (void)flags;
-    vp8_clear_system_state = vp8_clear_system_state_c;
-    if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state;
-
-    vp8_dequantize_b = vp8_dequantize_b_c;
-    if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx;
-
-    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx;
-
-    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
-    if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
-
-    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
-    if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
-
-    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
-    if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2;
-
-    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
-    if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2;
-
-    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
-    if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2;
-
-    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
-    if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
-
-    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2;
-
-    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2;
-
-    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2;
-
-    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2;
-
-    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
-    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
-
-
-    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
-    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
-    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
-
-    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
-    if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx;
-
-    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
-    if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx;
-    if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2;
-
-    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
-    if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx;
-
-    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-    if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx;
-
-    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
-    if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
-
-    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
-    if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
-
-
-    vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
-    if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx;
-    if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm;
-
-    vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
-    if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm;
-
-    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
-    if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2;
-
-    vp8_plane_add_noise = vp8_plane_add_noise_c;
-    if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx;
-    if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt;
-
-
-
-    vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
-    if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
-
-    vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
-    if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
-
-
-    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
-
-    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
-
-    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
-
-    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
-
     vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c;
     if (flags & HAS_MMX) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_mmx;
     if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
-
+    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
+    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
+    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
+    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
     if (flags & HAS_MMX) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_mmx;
     if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
-
-    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
-
-    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
-
-    vp8_variance4x4 = vp8_variance4x4_c;
-    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
-
-    vp8_variance8x8 = vp8_variance8x8_c;
-    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
-
-    vp8_variance8x16 = vp8_variance8x16_c;
-    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
-
-    vp8_variance16x8 = vp8_variance16x8_c;
-    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
-
-    vp8_variance16x16 = vp8_variance16x16_c;
-    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
-
-    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
-
-    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
-
-    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
-
-    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
-
-    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
-
-    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
-
-    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
-
-    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
-
-    vp8_sad4x4 = vp8_sad4x4_c;
-    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
-    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
-
-    vp8_sad8x8 = vp8_sad8x8_c;
-    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
-
-    vp8_sad8x16 = vp8_sad8x16_c;
-    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
-    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
-
-    vp8_sad16x8 = vp8_sad16x8_c;
-    if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx;
-    if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt;
-
+    vp8_block_error = vp8_block_error_c;
+    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
+    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
+    if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
+    if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
+    vp8_clear_system_state = vp8_clear_system_state_c;
+    if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state;
+    vp8_copy32xn = vp8_copy32xn_c;
+    if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2;
+    if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
+    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
+    if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx;
+    if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2;
+    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+    if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx;
+    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+    if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx;
+    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
+    if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx;
+    vp8_denoiser_filter = vp8_denoiser_filter_c;
+    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c;
+    if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2;
+    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx;
+    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
+    if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
+    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
+    if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
+    vp8_dequantize_b = vp8_dequantize_b_c;
+    if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx;
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
+    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
+    vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
+    if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
+    vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
+    if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
+    vp8_full_search_sad = vp8_full_search_sad_c;
+    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
+    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
+    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+    if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+    vp8_get_mb_ss = vp8_get_mb_ss_c;
+    if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx;
+    if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2;
+    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
+    if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
+    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
+    if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2;
+    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
+    if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2;
+    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
+    if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2;
+    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2;
+    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2;
+    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2;
+    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2;
+    vp8_mbblock_error = vp8_mbblock_error_c;
+    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
+    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
+    vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+    if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm;
+    vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+    if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx;
+    if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm;
+    vp8_mbuverror = vp8_mbuverror_c;
+    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
+    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
+    vp8_mse16x16 = vp8_mse16x16_c;
+    if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
+    vp8_plane_add_noise = vp8_plane_add_noise_c;
+    if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx;
+    if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt;
+    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
+    if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2;
+    vp8_refining_search_sad = vp8_refining_search_sad_c;
+    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
+    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
+    if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
     vp8_sad16x16 = vp8_sad16x16_c;
     if (flags & HAS_MMX) vp8_sad16x16 = vp8_sad16x16_mmx;
     if (flags & HAS_SSE2) vp8_sad16x16 = vp8_sad16x16_wmt;
     if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
-
-    vp8_sad4x4x3 = vp8_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
-
-    vp8_sad8x8x3 = vp8_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
-
-    vp8_sad8x16x3 = vp8_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
-
-    vp8_sad16x8x3 = vp8_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
-
     vp8_sad16x16x3 = vp8_sad16x16x3_c;
     if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
-
-    vp8_sad4x4x8 = vp8_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
-
-    vp8_sad8x8x8 = vp8_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
-
-    vp8_sad8x16x8 = vp8_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
-
-    vp8_sad16x8x8 = vp8_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
-
-    vp8_sad16x16x8 = vp8_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
-
-    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
-
-    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
-
-    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
-
-    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
-
     vp8_sad16x16x4d = vp8_sad16x16x4d_c;
     if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
-
-    vp8_get_mb_ss = vp8_get_mb_ss_c;
-    if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx;
-    if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2;
-
-    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
-
-    vp8_mse16x16 = vp8_mse16x16_c;
-    if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
-
-    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-    if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
-    vp8_copy32xn = vp8_copy32xn_c;
-    if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2;
-    if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
-
+    vp8_sad16x16x8 = vp8_sad16x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
+    vp8_sad16x8 = vp8_sad16x8_c;
+    if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt;
+    vp8_sad16x8x3 = vp8_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
+    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
+    vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4 = vp8_sad4x4_c;
+    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16 = vp8_sad8x16_c;
+    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8 = vp8_sad8x8_c;
+    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
     vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
     if (flags & HAS_MMX) vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2;
-
     vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
     if (flags & HAS_MMX) vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2;
-
+    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
+    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
+    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
+    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
+    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
     vp8_short_walsh4x4 = vp8_short_walsh4x4_c;
     if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2;
-
-    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
-
-    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
-    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-    vp8_block_error = vp8_block_error_c;
-    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
-    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
-
-    vp8_mbblock_error = vp8_mbblock_error_c;
-    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
-    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
-
-    vp8_mbuverror = vp8_mbuverror_c;
-    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
-    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
-
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
+    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
     vp8_subtract_b = vp8_subtract_b_c;
     if (flags & HAS_MMX) vp8_subtract_b = vp8_subtract_b_mmx;
     if (flags & HAS_SSE2) vp8_subtract_b = vp8_subtract_b_sse2;
-
-    vp8_subtract_mby = vp8_subtract_mby_c;
-    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
-    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
-
     vp8_subtract_mbuv = vp8_subtract_mbuv_c;
     if (flags & HAS_MMX) vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
     if (flags & HAS_SSE2) vp8_subtract_mbuv = vp8_subtract_mbuv_sse2;
-
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
-    vp8_refining_search_sad = vp8_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
-
+    vp8_subtract_mby = vp8_subtract_mby_c;
+    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
+    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
     vp8_temporal_filter_apply = vp8_temporal_filter_apply_c;
     if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2;
-
-
-    vp8_denoiser_filter = vp8_denoiser_filter_c;
-    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_variance16x16 = vp8_variance16x16_c;
+    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
+    vp8_variance16x8 = vp8_variance16x8_c;
+    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
+    vp8_variance4x4 = vp8_variance4x4_c;
+    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
+    vp8_variance8x16 = vp8_variance8x16_c;
+    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
+    vp8_variance8x8 = vp8_variance8x8_c;
+    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
+    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
+    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv =
vp8_variance_halfpixvar16x16_hv_mmx; + if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt; + vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c; + if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx; + if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp8_rtcd_x86-linux-gcc.h b/media/libvpx/vp8_rtcd_x86-linux-gcc.h index a13ca2b5a73b..a4f8d95dd3cf 100644 --- a/media/libvpx/vp8_rtcd_x86-linux-gcc.h +++ b/media/libvpx/vp8_rtcd_x86-linux-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP8 */ @@ -22,32 +26,146 @@ struct variance_vtable; union int_mv; struct yv12_buffer_config; +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_b vp8_blend_b_c + +void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_mb_inner vp8_blend_mb_inner_c + +void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_mb_outer vp8_blend_mb_outer_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int 
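The churn in these generated vp8_rtcd_*.h headers is easier to review with the underlying pattern in mind: run-time CPU detection (RTCD) routes every function that has more than one implementation through a function pointer, and a setup routine run once at initialization points each pointer at the best variant the detected flags allow. Below is a minimal, self-contained sketch of that pattern, not libvpx code; the my_-prefixed names are invented for illustration, and the single HAS_SSE2 bit stands in for the real capability flags the headers obtain from vpx_ports.

/* Sketch of the RTCD dispatch pattern implemented by the generated
 * headers in this patch. Everything prefixed my_ is illustrative. */
#include <stdio.h>

#define HAS_SSE2 0x01 /* stand-in for the real capability bit */

static unsigned int my_sad_c(const unsigned char *a,
                             const unsigned char *b, int n) {
  unsigned int sad = 0;
  int i;
  for (i = 0; i < n; ++i)
    sad += (unsigned int)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
  return sad;
}

/* A real SSE2 variant would be vectorized; this stub only shows the shape. */
static unsigned int my_sad_sse2(const unsigned char *a,
                                const unsigned char *b, int n) {
  return my_sad_c(a, b, n);
}

/* The function pointer every call site goes through. */
static unsigned int (*my_sad)(const unsigned char *, const unsigned char *, int);

/* Mirrors setup_rtcd_internal(): default to C, then upgrade per flag. */
static void my_setup_rtcd(int flags) {
  my_sad = my_sad_c;
  if (flags & HAS_SSE2) my_sad = my_sad_sse2;
}

int main(void) {
  unsigned char x[16] = { 10 }, y[16] = { 0 };
  my_setup_rtcd(HAS_SSE2);
  printf("sad = %u\n", my_sad(x, y, 16));
  return 0;
}

The hunk above is this pattern repeated for every dispatched entry point; most of the diff is the generator re-emitting the same assignments in alphabetical order.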
diff --git a/media/libvpx/vp8_rtcd_x86-linux-gcc.h b/media/libvpx/vp8_rtcd_x86-linux-gcc.h
index a13ca2b5a73b..a4f8d95dd3cf 100644
--- a/media/libvpx/vp8_rtcd_x86-linux-gcc.h
+++ b/media/libvpx/vp8_rtcd_x86-linux-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
 
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 RTCD_EXTERN void (*vp8_clear_system_state)();
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
 RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
 
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
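Two binding styles alternate through these declarations. A function with only a C implementation is bound at compile time with a #define alias (vp8_fast_quantize_b_pair and vp8_filter_by_weight4x4 above), so calls resolve with no indirection; only functions with SIMD variants pay for a pointer. A compressed sketch of the two forms follows, with invented my_ names; the RTCD_C/RTCD_EXTERN guard mirrors the one partially visible in the first hunk of this header.

/* Illustrative my_ names; only the shape matches the generated headers. */

/* One implementation: alias resolved by the preprocessor, zero cost. */
void my_only_c(int *x);
#define my_only my_only_c

/* Several implementations: one shared pointer. The header defines
 * RTCD_EXTERN as `extern` for ordinary includers and as nothing inside
 * the single translation unit that defines RTCD_C, so exactly one
 * definition of each pointer exists at link time. */
#ifdef RTCD_C
#define RTCD_EXTERN
#else
#define RTCD_EXTERN extern
#endif

void my_fast_c(int *x);
void my_fast_sse2(int *x);
RTCD_EXTERN void (*my_fast)(int *x);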
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+
+unsigned int vp8_get_mb_ss_c(const short *);
+unsigned int vp8_get_mb_ss_mmx(const short *);
+unsigned int vp8_get_mb_ss_sse2(const short *);
+RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *);
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 RTCD_EXTERN void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
 
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 RTCD_EXTERN void (*vp8_plane_add_noise)(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
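Either way the binding is invisible at the call site, which always calls through the bare name. A small runnable sketch of consuming code, again with hypothetical my_ names rather than the real vp8_* tables declared above:

#include <string.h>

/* Hypothetical pointer, as a generated header would declare it. */
static void (*my_copy16)(unsigned char *dst, const unsigned char *src);

static void my_copy16_c(unsigned char *dst, const unsigned char *src) {
  memcpy(dst, src, 16); /* reference implementation */
}

static void decode_row(unsigned char *dst, const unsigned char *src) {
  /* The decoder neither knows nor cares whether my_copy16 is the C
   * fallback or a SIMD variant picked at startup. */
  my_copy16(dst, src);
}

int main(void) {
  unsigned char src[16] = { 1 }, dst[16];
  my_copy16 = my_copy16_c; /* what setup_rtcd_internal() does, in miniature */
  decode_row(dst, src);
  return dst[0] == 1 ? 0 : 1;
}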
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
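As a reading aid for the variance prototypes being moved around here: in the VP8 reference implementation the return value of a vp8_variance* function is the block variance, and the raw sum of squared differences comes back through the *sse out-parameter, related by var = sse - sum*sum/N for the N pixels compared. The sketch below reconstructs that contract for a single 8x8 block; the my_ name is invented and this is a paraphrase of the semantics, not the literal libvpx source.

#include <stdint.h>

/* Returns the variance, stores the raw sum of squared differences
 * through *sse. Strides are in bytes; 64 pixels compared, hence >> 6. */
static unsigned int my_variance8x8(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int d = src[c] - ref[c];
      sum += d;
      sq += (unsigned int)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = sq;
  return sq - (unsigned int)(((int64_t)sum * sum) >> 6);
}

int main(void) {
  unsigned char a[64], b[64];
  unsigned int sse, var;
  int i;
  for (i = 0; i < 64; ++i) { a[i] = (unsigned char)i; b[i] = 0; }
  var = my_variance8x8(a, 8, b, 8, &sse);
  return var <= sse ? 0 : 1; /* variance never exceeds the raw SSE */
}

The vp8_sad* prototypes that follow have a simpler contract: the return value is the plain sum of absolute differences, and the max_sad argument lets an implementation bail out early once that bound is exceeded; the C reference is free to ignore it.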
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
@@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons
 unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
 void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
 void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x4d)(const
unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *); +void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); - -void 
vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_short_fdct4x4_c(short *input, short *output, int pitch); void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); @@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_mmx(short *input, short *output); +void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); -void vp8_regular_quantize_b_c(struct block *, struct blockd *); -void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); -RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_fast_quantize_b_c(struct block *, struct blockd *); -void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); -void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); -RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void 
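A note on the shapes of the SAD entry points above: the plain vp8_sadNxM functions return a single sum of absolute differences between a source block and one candidate reference (with max_sad available as an early-out bound), the x3 and x8 forms score three or eight consecutive candidates into sad_array, and the x4d forms take an array of four reference pointers so one call prices four arbitrary candidates. A minimal C sketch of the reference semantics for the 16x16 case, illustrative only, with hypothetical names and the max_sad cut-off omitted:

/* Sketch: what vp8_sad16x16_c computes (hypothetical helper, not patch code). */
#include <stdlib.h>

static unsigned int sketch_sad16x16(const unsigned char *src_ptr, int src_stride,
                                    const unsigned char *ref_ptr, int ref_stride)
{
    unsigned int sad = 0;
    int r, c;
    for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c)
            sad += abs(src_ptr[c] - ref_ptr[c]);  /* per-pixel absolute difference */
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}

/* Sketch: the x4d form prices four candidate references in one call. */
static void sketch_sad16x16x4d(const unsigned char *src_ptr, int src_stride,
                               const unsigned char *const ref_ptr[4], int ref_stride,
                               unsigned int sad_array[4])
{
    int i;
    for (i = 0; i < 4; ++i)
        sad_array[i] = sketch_sad16x16(src_ptr, src_stride, ref_ptr[i], ref_stride);
}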
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
@@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
 
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
 
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
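The sixtap_predict functions above produce a motion-compensated block at a sub-pixel position: a six-tap FIR filter is run over the reference, first horizontally and then vertically, with xofst and yofst each selecting one of eight filter phases. A sketch of one filtered sample under those assumptions; sketch_filter stands in for the real coefficient table in vp8/common/filter.c (which this patch does not show), and only phase 0 is filled in:

/* Sketch of a single six-tap filtered sample (hypothetical names). */
static const int sketch_filter[8][6] = {
    { 0, 0, 128, 0, 0, 0 }  /* phase 0: pass-through; other phases omitted */
};

static unsigned char sketch_sixtap_sample(const unsigned char *src, int step, int phase)
{
    /* Six taps centred on the target position: src[-2*step .. 3*step];
       step is 1 for the horizontal pass and the source pitch for the vertical. */
    int k, sum = 0;
    for (k = 0; k < 6; ++k)
        sum += sketch_filter[phase][k] * src[(k - 2) * step];
    sum = (sum + 64) >> 7;  /* round and rescale: coefficients sum to 128 */
    if (sum < 0) sum = 0;
    if (sum > 255) sum = 255;
    return (unsigned char)sum;
}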
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-int vp8_block_error_mmx(short *coeff, short *dqcoeff);
-int vp8_block_error_xmm(short *coeff, short *dqcoeff);
-RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
-int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
-RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
+unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_mbuverror_c(struct macroblock *mb);
-int vp8_mbuverror_mmx(struct macroblock *mb);
-int vp8_mbuverror_xmm(struct macroblock *mb);
-RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
+unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
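The sub_pixel_variance entry points above first build a prediction interpolated to the requested (xoffset, yoffset) phase and then measure the variance of the residual against src_ptr; the vp8_variance_halfpixvar16x16_h/v/hv functions declared further down are the same computation specialised to the three half-pel phases the motion search probes most often. A deliberately simplified one-pixel sketch of the interpolation step, assuming eighth-pel offsets and a single-pass bilinear blend (the real code runs two separate filter passes):

/* Sketch: bilinear sample at an eighth-pel offset (hypothetical name). */
static unsigned char sketch_bilinear_sample(const unsigned char *ref, int ref_stride,
                                            int xoffset, int yoffset)  /* 0..7 each */
{
    int a = ref[0], b = ref[1];                        /* top neighbours */
    int c = ref[ref_stride], d = ref[ref_stride + 1];  /* bottom neighbours */
    int top = a * (8 - xoffset) + b * xoffset;
    int bottom = c * (8 - xoffset) + d * xoffset;
    /* Total weight is 64, so add 32 to round before the shift. */
    return (unsigned char)((top * (8 - yoffset) + bottom * yoffset + 32) >> 6);
}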
 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch);
 RTCD_EXTERN void (*vp8_subtract_b)(struct block *be, struct blockd *bd, int pitch);
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-
 void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 RTCD_EXTERN void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
+unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
-int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-RTCD_EXTERN int (*vp8_denoiser_filter)(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
+unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
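The variance functions in this region share one contract: *sse receives the raw sum of squared differences, and the return value is that sum minus the squared mean of the differences, i.e. variance = sse - (sum * sum) / N. A sketch for the 8x8 case, where N = 64 and the division becomes a shift by 6 (illustrative name only):

/* Sketch of the vp8_variance8x8 contract (hypothetical helper). */
static unsigned int sketch_variance8x8(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       unsigned int *sse)
{
    int r, c, sum = 0;
    unsigned int sq = 0;
    for (r = 0; r < 8; ++r) {
        for (c = 0; c < 8; ++c) {
            int d = src[c] - ref[c];
            sum += d;                      /* signed sum of differences */
            sq += (unsigned int)(d * d);   /* sum of squared differences */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sq;
    return sq - (unsigned int)(((long long)sum * sum) >> 6);
}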
+unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vp8_rtcd(void);
 
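The hunk below regenerates the body of setup_rtcd_internal(). The dispatch scheme is unchanged; it is simply re-emitted in alphabetical order and extended with the new entry points such as vp8_denoiser_filter_uv and the SSE4.1 regular quantizer. Every RTCD_EXTERN pointer is first set to its portable _c implementation, and each subsequent flag test overwrites it, so the strongest variant the CPU supports wins. A stripped-down model of the mechanism, with demo names standing in for the real symbols and flags:

/* Minimal model of RTCD dispatch (illustration, not code from the tree). */
#define DEMO_HAS_MMX  0x01
#define DEMO_HAS_SSE2 0x04

static int demo_op_c(int x)    { return x + 1; }  /* portable fallback */
static int demo_op_mmx(int x)  { return x + 1; }  /* stands in for an asm version */
static int demo_op_sse2(int x) { return x + 1; }  /* stands in for an asm version */

static int (*demo_op)(int);

static void demo_setup_rtcd(int flags)
{
    demo_op = demo_op_c;                                /* always-correct default */
    if (flags & DEMO_HAS_MMX)  demo_op = demo_op_mmx;   /* later tests override */
    if (flags & DEMO_HAS_SSE2) demo_op = demo_op_sse2;  /* earlier ones */
}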
if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx; - if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2; - - vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; - if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; - - vp8_copy_mem16x16 = vp8_copy_mem16x16_c; - if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx; - if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; - - vp8_copy_mem8x8 = vp8_copy_mem8x8_c; - if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; - - vp8_copy_mem8x4 = vp8_copy_mem8x4_c; - if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; - - vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c; - if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3; - - vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c; - if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3; - - - vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; - if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx; - if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm; - - vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm; - - vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c; - if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2; - - vp8_plane_add_noise = vp8_plane_add_noise_c; - if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx; - if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt; - - - - - vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c; - if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; - - vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; - if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; - - - vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c; - if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx; - if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; - - vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c; - if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx; - if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; - - vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c; - if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx; - if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; - - vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c; - if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; - if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; - vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c; if (flags & HAS_MMX) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_mmx; if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) 
vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; - + vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; + if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; + if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; if (flags & HAS_MMX) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_mmx; if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; - - vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; - - vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; - - vp8_variance4x4 = vp8_variance4x4_c; - if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx; - if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt; - - vp8_variance8x8 = vp8_variance8x8_c; - if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx; - if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt; - - vp8_variance8x16 = vp8_variance8x16_c; - if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx; - if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt; - - vp8_variance16x8 = vp8_variance16x8_c; - if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx; - if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt; - - vp8_variance16x16 = vp8_variance16x16_c; - if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx; - if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt; - - vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c; - if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt; - - vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c; - if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt; - - vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c; - if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt; - - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c; - if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; - if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3; - - vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c; - if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; - if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3; - - vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c; - if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx; - if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt; - - vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c; - if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx; - if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = 
vp8_variance_halfpixvar16x16_v_wmt; - - vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c; - if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx; - if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt; - - vp8_sad4x4 = vp8_sad4x4_c; - if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx; - if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt; - - vp8_sad8x8 = vp8_sad8x8_c; - if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx; - if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt; - - vp8_sad8x16 = vp8_sad8x16_c; - if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx; - if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt; - - vp8_sad16x8 = vp8_sad16x8_c; - if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx; - if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt; - + vp8_block_error = vp8_block_error_c; + if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx; + if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm; + vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c; + if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2; + if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3; + vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c; + if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; + if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3; + vp8_clear_system_state = vp8_clear_system_state_c; + if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state; + vp8_copy32xn = vp8_copy32xn_c; + if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2; + if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; + vp8_copy_mem16x16 = vp8_copy_mem16x16_c; + if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx; + if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2; + vp8_copy_mem8x4 = vp8_copy_mem8x4_c; + if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx; + vp8_copy_mem8x8 = vp8_copy_mem8x8_c; + if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx; + vp8_dc_only_idct_add = vp8_dc_only_idct_add_c; + if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx; + vp8_denoiser_filter = vp8_denoiser_filter_c; + if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2; + vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c; + if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2; + vp8_dequant_idct_add = vp8_dequant_idct_add_c; + if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx; + vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c; + if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx; + if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2; + vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c; + if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx; + if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2; + vp8_dequantize_b = vp8_dequantize_b_c; + if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx; + vp8_diamond_search_sad = vp8_diamond_search_sad_c; + if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4; + vp8_fast_quantize_b = vp8_fast_quantize_b_c; + if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; 
+ if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; + vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c; + if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2; + vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c; + if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2; + vp8_full_search_sad = vp8_full_search_sad_c; + if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; + if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; + vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; + if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; + vp8_get_mb_ss = vp8_get_mb_ss_c; + if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx; + if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2; + vp8_loop_filter_bh = vp8_loop_filter_bh_c; + if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2; + vp8_loop_filter_bv = vp8_loop_filter_bv_c; + if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2; + vp8_loop_filter_mbh = vp8_loop_filter_mbh_c; + if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2; + vp8_loop_filter_mbv = vp8_loop_filter_mbv_c; + if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2; + vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2; + vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2; + vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2; + vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c; + if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx; + if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2; + vp8_mbblock_error = vp8_mbblock_error_c; + if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx; + if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm; + vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; + if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm; + vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; + if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx; + if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm; + vp8_mbuverror = vp8_mbuverror_c; + if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx; + if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm; + vp8_mse16x16 = vp8_mse16x16_c; + if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx; + if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt; + vp8_plane_add_noise = vp8_plane_add_noise_c; + if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx; + if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt; + vp8_post_proc_down_and_across_mb_row = 
vp8_post_proc_down_and_across_mb_row_c; + if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2; + vp8_refining_search_sad = vp8_refining_search_sad_c; + if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + vp8_regular_quantize_b = vp8_regular_quantize_b_c; + if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sad16x16 = vp8_sad16x16_c; if (flags & HAS_MMX) vp8_sad16x16 = vp8_sad16x16_mmx; if (flags & HAS_SSE2) vp8_sad16x16 = vp8_sad16x16_wmt; if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3; - - vp8_sad4x4x3 = vp8_sad4x4x3_c; - if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3; - - vp8_sad8x8x3 = vp8_sad8x8x3_c; - if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3; - - vp8_sad8x16x3 = vp8_sad8x16x3_c; - if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3; - - vp8_sad16x8x3 = vp8_sad16x8x3_c; - if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3; - vp8_sad16x16x3 = vp8_sad16x16x3_c; if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3; if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3; - - vp8_sad4x4x8 = vp8_sad4x4x8_c; - if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4; - - vp8_sad8x8x8 = vp8_sad8x8x8_c; - if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4; - - vp8_sad8x16x8 = vp8_sad8x16x8_c; - if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4; - - vp8_sad16x8x8 = vp8_sad16x8x8_c; - if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4; - - vp8_sad16x16x8 = vp8_sad16x16x8_c; - if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4; - - vp8_sad4x4x4d = vp8_sad4x4x4d_c; - if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3; - - vp8_sad8x8x4d = vp8_sad8x8x4d_c; - if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3; - - vp8_sad8x16x4d = vp8_sad8x16x4d_c; - if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3; - - vp8_sad16x8x4d = vp8_sad16x8x4d_c; - if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3; - vp8_sad16x16x4d = vp8_sad16x16x4d_c; if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3; - - vp8_get_mb_ss = vp8_get_mb_ss_c; - if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2; - - vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c; - if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt; - - vp8_mse16x16 = vp8_mse16x16_c; - if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx; - if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt; - - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx; - - vp8_copy32xn = vp8_copy32xn_c; - if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2; - if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; - + vp8_sad16x16x8 = vp8_sad16x16x8_c; + if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4; + vp8_sad16x8 = vp8_sad16x8_c; + if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx; + if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt; + vp8_sad16x8x3 = vp8_sad16x8x3_c; + if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3; + if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3; + vp8_sad16x8x4d = vp8_sad16x8x4d_c; + if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3; + 
vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4 = vp8_sad4x4_c;
+    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16 = vp8_sad8x16_c;
+    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8 = vp8_sad8x8_c;
+    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
     vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
     if (flags & HAS_MMX) vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2;
-
     vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
     if (flags & HAS_MMX) vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2;
-
+    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
+    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
+    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
+    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
+    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
     vp8_short_walsh4x4 = vp8_short_walsh4x4_c;
     if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2;
-
-    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
-
-    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
-    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-    vp8_block_error = vp8_block_error_c;
-    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
-    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
-
-    vp8_mbblock_error = vp8_mbblock_error_c;
-    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
-    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
-
-    vp8_mbuverror = vp8_mbuverror_c;
-    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
-    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
-
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
+    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
     vp8_subtract_b = vp8_subtract_b_c;
     if (flags & HAS_MMX) vp8_subtract_b = vp8_subtract_b_mmx;
     if (flags & HAS_SSE2) vp8_subtract_b = vp8_subtract_b_sse2;
-
-    vp8_subtract_mby = vp8_subtract_mby_c;
-    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
-    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
-
     vp8_subtract_mbuv = vp8_subtract_mbuv_c;
     if (flags & HAS_MMX) vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
     if (flags & HAS_SSE2) vp8_subtract_mbuv = vp8_subtract_mbuv_sse2;
-
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
-    vp8_refining_search_sad = vp8_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
-
+    vp8_subtract_mby = vp8_subtract_mby_c;
+    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
+    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
     vp8_temporal_filter_apply = vp8_temporal_filter_apply_c;
     if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2;
-
-
-    vp8_denoiser_filter = vp8_denoiser_filter_c;
-    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_variance16x16 = vp8_variance16x16_c;
+    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
+    vp8_variance16x8 = vp8_variance16x8_c;
+    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
+    vp8_variance4x4 = vp8_variance4x4_c;
+    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
+    vp8_variance8x16 = vp8_variance8x16_c;
+    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
+    vp8_variance8x8 = vp8_variance8x8_c;
+    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
+    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
+    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
+    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
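The generated header that ends above, and the one whose diff follows, both implement the same run-time CPU detection (RTCD) pattern: each kernel is declared once per instruction-set variant, the public name is an RTCD_EXTERN function pointer, and the setup routine first binds every pointer to the portable _c version and then rebinds it to the fastest variant whose HAS_* bit is present in the CPU flags. A minimal, self-contained sketch of that pattern, using illustrative names only (my_sum, my_rtcd and the flag values are not libvpx identifiers):

#include <stdio.h>

#define HAS_MMX  0x01  /* illustrative bit values; real flags come from cpuid */
#define HAS_SSE2 0x04

/* Portable reference kernel. */
static int my_sum_c(const unsigned char *p, int n) {
    int s = 0;
    for (int i = 0; i < n; ++i) s += p[i];
    return s;
}

/* Stand-in for an SSE2 kernel; a real one would use intrinsics. */
static int my_sum_sse2(const unsigned char *p, int n) {
    return my_sum_c(p, n);  /* same contract, faster implementation */
}

/* The dispatch slot, i.e. what an RTCD_EXTERN declaration exposes. */
static int (*my_sum)(const unsigned char *p, int n);

/* What the setup code above does for every slot. */
static void my_rtcd(int flags) {
    my_sum = my_sum_c;                           /* always-safe default first */
    if (flags & HAS_SSE2) my_sum = my_sum_sse2;  /* then upgrade if supported */
}

int main(void) {
    const unsigned char buf[4] = {1, 2, 3, 4};
    my_rtcd(HAS_MMX | HAS_SSE2);
    printf("%d\n", my_sum(buf, 4));              /* prints 10 */
    return 0;
}

Because every slot is seeded with the _c fallback before any flag is tested, a CPU with no SIMD support still gets a complete working dispatch table; the flag checks can only ever upgrade a slot.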
diff --git a/media/libvpx/vp8_rtcd_x86-win32-gcc.h b/media/libvpx/vp8_rtcd_x86-win32-gcc.h
index a13ca2b5a73b..a4f8d95dd3cf 100644
--- a/media/libvpx/vp8_rtcd_x86-win32-gcc.h
+++ b/media/libvpx/vp8_rtcd_x86-win32-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
 
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 RTCD_EXTERN void (*vp8_clear_system_state)();
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
 RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
 
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
 
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+
+unsigned int vp8_get_mb_ss_c(const short *);
+unsigned int vp8_get_mb_ss_mmx(const short *);
+unsigned int vp8_get_mb_ss_sse2(const short *);
+RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *);
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 RTCD_EXTERN void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 RTCD_EXTERN void (*vp8_plane_add_noise)(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
@@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons
 unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); - -void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); - -void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); 
+RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
-unsigned int vp8_get_mb_ss_c(const short *);
-unsigned int vp8_get_mb_ss_mmx(const short *);
-unsigned int vp8_get_mb_ss_sse2(const short *);
-RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *);
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
@@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
 
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
 
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-int vp8_block_error_mmx(short *coeff, short *dqcoeff);
-int vp8_block_error_xmm(short *coeff, short *dqcoeff);
-RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
-int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
-RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
+unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_mbuverror_c(struct macroblock *mb);
-int vp8_mbuverror_mmx(struct macroblock *mb);
-int vp8_mbuverror_xmm(struct macroblock *mb);
-RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
+unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch);
 RTCD_EXTERN void (*vp8_subtract_b)(struct block *be, struct blockd *bd, int pitch);
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-
 void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 RTCD_EXTERN void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
 
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
+unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
-int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-RTCD_EXTERN int (*vp8_denoiser_filter)(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
+unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vp8_rtcd(void);
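The hunk below regenerates setup_rtcd_internal(), the routine that gives every RTCD_EXTERN declaration above a concrete target: each pointer starts at the plain-C implementation and is then upgraded whenever the CPU-feature flags allow. Besides the alphabetical reordering, the new table picks up vp8_regular_quantize_b_sse4_1 and the new denoiser entry points. A minimal sketch of the dispatch pattern, with illustrative names only (not the real libvpx symbols; the real flags come from the library's CPU-detection helper):

    /* Sketch of the RTCD pattern: an entry point is a function pointer
     * bound at startup to the best implementation the running CPU supports. */
    #include <stdio.h>

    #define HAS_SSE2 0x10  /* hypothetical capability bit */

    static unsigned int sad_c(const unsigned char *a, const unsigned char *b)
    { (void)a; (void)b; return 1; /* portable C baseline */ }
    static unsigned int sad_sse2(const unsigned char *a, const unsigned char *b)
    { (void)a; (void)b; return 2; /* SIMD variant, stubbed for the sketch */ }

    static unsigned int (*sad)(const unsigned char *, const unsigned char *);

    static void setup_rtcd_internal(int flags)
    {
        sad = sad_c;                           /* default to the C version */
        if (flags & HAS_SSE2) sad = sad_sse2;  /* upgrade if the CPU allows */
    }

    int main(void)
    {
        setup_rtcd_internal(HAS_SSE2);  /* real code reads CPUID-derived flags */
        printf("%u\n", sad(0, 0));
        return 0;
    }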
@@ -488,348 +494,256 @@ static void setup_rtcd_internal(void)
     (void)flags;
 
-    vp8_clear_system_state = vp8_clear_system_state_c;
-    if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state;
-
-    vp8_dequantize_b = vp8_dequantize_b_c;
-    if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx;
-
-    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx;
-
-    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
-    if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
-
-    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
-    if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
-
-    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
-    if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2;
-
-    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
-    if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2;
-
-    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
-    if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2;
-
-    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
-    if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
-
-    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2;
-
-    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2;
-
-    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2;
-
-    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2;
-
-    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
-    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
-
-
-    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
-    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
-    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
-
-    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
-    if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx;
-
-    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
-    if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx;
-    if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2;
-
-    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
-    if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx;
-
-    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-    if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx;
-
-    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
-    if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
-
-    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
-    if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
-
-
-    vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
-    if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx;
-    if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm;
-
-    vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
-    if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm;
-
-    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
-    if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2;
-
-    vp8_plane_add_noise = vp8_plane_add_noise_c;
-    if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx;
-    if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt;
-
-
-
-
-    vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
-    if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
-
-    vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
-    if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
-
-
-    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
-
-    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
-
-    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
-
-    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
-
     vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c;
     if (flags & HAS_MMX) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_mmx;
     if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
-
+    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
+    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
+    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
+    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
     if (flags & HAS_MMX) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_mmx;
     if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
-
-    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
-
-    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
-
-    vp8_variance4x4 = vp8_variance4x4_c;
-    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
-
-    vp8_variance8x8 = vp8_variance8x8_c;
-    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
-
-    vp8_variance8x16 = vp8_variance8x16_c;
-    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
-
-    vp8_variance16x8 = vp8_variance16x8_c;
-    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
-
-    vp8_variance16x16 = vp8_variance16x16_c;
-    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
-
-    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
-
-    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
-
-    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
-
-    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
-
-    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
-
-    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
-
-    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
-
-    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
-
-    vp8_sad4x4 = vp8_sad4x4_c;
-    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
-    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
-
-    vp8_sad8x8 = vp8_sad8x8_c;
-    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
-
-    vp8_sad8x16 = vp8_sad8x16_c;
-    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
-    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
-
-    vp8_sad16x8 = vp8_sad16x8_c;
-    if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx;
-    if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt;
-
+    vp8_block_error = vp8_block_error_c;
+    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
+    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
+    if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
+    if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
+    vp8_clear_system_state = vp8_clear_system_state_c;
+    if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state;
+    vp8_copy32xn = vp8_copy32xn_c;
+    if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2;
+    if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
+    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
+    if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx;
+    if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2;
+    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+    if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx;
+    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+    if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx;
+    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
+    if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx;
+    vp8_denoiser_filter = vp8_denoiser_filter_c;
+    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c;
+    if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2;
+    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx;
+    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
+    if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
+    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
+    if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
+    vp8_dequantize_b = vp8_dequantize_b_c;
+    if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx;
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
+    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
+    vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
+    if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
+    vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
+    if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
+    vp8_full_search_sad = vp8_full_search_sad_c;
+    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
+    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
+    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+    if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+    vp8_get_mb_ss = vp8_get_mb_ss_c;
+    if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx;
+    if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2;
+    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
+    if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
+    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
+    if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2;
+    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
+    if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2;
+    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
+    if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2;
+    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2;
+    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2;
+    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2;
+    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2;
+    vp8_mbblock_error = vp8_mbblock_error_c;
+    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
+    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
+    vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+    if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm;
+    vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+    if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx;
+    if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm;
+    vp8_mbuverror = vp8_mbuverror_c;
+    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
+    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
+    vp8_mse16x16 = vp8_mse16x16_c;
+    if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
+    vp8_plane_add_noise = vp8_plane_add_noise_c;
+    if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx;
+    if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt;
+    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
+    if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2;
+    vp8_refining_search_sad = vp8_refining_search_sad_c;
+    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
+    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
+    if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
     vp8_sad16x16 = vp8_sad16x16_c;
     if (flags & HAS_MMX) vp8_sad16x16 = vp8_sad16x16_mmx;
     if (flags & HAS_SSE2) vp8_sad16x16 = vp8_sad16x16_wmt;
     if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
-
-    vp8_sad4x4x3 = vp8_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
-
-    vp8_sad8x8x3 = vp8_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
-
-    vp8_sad8x16x3 = vp8_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
-
-    vp8_sad16x8x3 = vp8_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
-
     vp8_sad16x16x3 = vp8_sad16x16x3_c;
     if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
     if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
-
-    vp8_sad4x4x8 = vp8_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
-
-    vp8_sad8x8x8 = vp8_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
-
-    vp8_sad8x16x8 = vp8_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
-
-    vp8_sad16x8x8 = vp8_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
-
-    vp8_sad16x16x8 = vp8_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
-
-    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
-
-    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
-
-    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
-
-    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
-
     vp8_sad16x16x4d = vp8_sad16x16x4d_c;
     if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
-
-    vp8_get_mb_ss = vp8_get_mb_ss_c;
-    if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx;
-    if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2;
-
-    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
-
-    vp8_mse16x16 = vp8_mse16x16_c;
-    if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
-
-    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-    if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
-    vp8_copy32xn = vp8_copy32xn_c;
-    if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2;
-    if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
-
+    vp8_sad16x16x8 = vp8_sad16x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
+    vp8_sad16x8 = vp8_sad16x8_c;
+    if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt;
+    vp8_sad16x8x3 = vp8_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
+    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
+    vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4 = vp8_sad4x4_c;
+    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16 = vp8_sad8x16_c;
+    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8 = vp8_sad8x8_c;
+    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
     vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
     if (flags & HAS_MMX) vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2;
-
     vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
     if (flags & HAS_MMX) vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2;
-
+    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
+    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
+    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
+    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
+    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
    vp8_short_walsh4x4 = vp8_short_walsh4x4_c;
     if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2;
-
-    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
-
-    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
-    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-    vp8_block_error = vp8_block_error_c;
-    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
-    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
-
-    vp8_mbblock_error = vp8_mbblock_error_c;
-    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
-    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
-
-    vp8_mbuverror = vp8_mbuverror_c;
-    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
-    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
-
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
+    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
     vp8_subtract_b = vp8_subtract_b_c;
     if (flags & HAS_MMX) vp8_subtract_b = vp8_subtract_b_mmx;
     if (flags & HAS_SSE2) vp8_subtract_b = vp8_subtract_b_sse2;
-
-    vp8_subtract_mby = vp8_subtract_mby_c;
-    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
-    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
-
     vp8_subtract_mbuv = vp8_subtract_mbuv_c;
     if (flags & HAS_MMX) vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
     if (flags & HAS_SSE2) vp8_subtract_mbuv = vp8_subtract_mbuv_sse2;
-
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
-    vp8_refining_search_sad = vp8_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
-
+    vp8_subtract_mby = vp8_subtract_mby_c;
+    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
+    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
     vp8_temporal_filter_apply = vp8_temporal_filter_apply_c;
     if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2;
-
-
-    vp8_denoiser_filter = vp8_denoiser_filter_c;
-    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_variance16x16 = vp8_variance16x16_c;
+    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
+    vp8_variance16x8 = vp8_variance16x8_c;
+    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
+    vp8_variance4x4 = vp8_variance4x4_c;
+    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
+    vp8_variance8x16 = vp8_variance8x16_c;
+    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
+    vp8_variance8x8 = vp8_variance8x8_c;
+    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
+    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
+    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
+    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
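The same regeneration is applied below to the win32/VS8 flavor of the header, which additionally gains the extern "C" guards so that C++ callers link against the unmangled C symbols. Usage is unchanged: a caller runs vp8_rtcd() once and then calls through the pointers. A hedged caller-side sketch (the include name and buffers are illustrative; vp8_rtcd() and the vp8_sad16x16 signature are as declared in these headers):

    /* Call vp8_rtcd() once, then use the dispatched entry points. */
    #include <limits.h>
    #include "vp8_rtcd.h"  /* illustrative include; the real path is build-dependent */

    unsigned int sad_16x16(const unsigned char *src, const unsigned char *ref)
    {
        static int done;
        if (!done) {
            vp8_rtcd();  /* binds every RTCD_EXTERN pointer for this CPU */
            done = 1;
        }
        /* vp8_sad16x16 now points at the _c, _mmx, _wmt, or _sse3 variant. */
        return vp8_sad16x16(src, 16, ref, 16, UINT_MAX);
    }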
diff --git a/media/libvpx/vp8_rtcd_x86-win32-vs8.h b/media/libvpx/vp8_rtcd_x86-win32-vs8.h
index a13ca2b5a73b..a4f8d95dd3cf 100644
--- a/media/libvpx/vp8_rtcd_x86-win32-vs8.h
+++ b/media/libvpx/vp8_rtcd_x86-win32-vs8.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
 
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 RTCD_EXTERN void (*vp8_clear_system_state)();
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter)(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
 
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
 RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride);
 
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 RTCD_EXTERN void (*vp8_dequant_idct_add_uv_block)(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc);
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+
+unsigned int vp8_get_mb_ss_c(const short *);
+unsigned int vp8_get_mb_ss_mmx(const short *);
+unsigned int vp8_get_mb_ss_sse2(const short *);
+RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *);
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit);
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 RTCD_EXTERN void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vp8_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
 
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 RTCD_EXTERN void (*vp8_plane_add_noise)(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+RTCD_EXTERN void (*vp8_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
 
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
 
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
 
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-RTCD_EXTERN void (*vp8_filter_by_weight8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
@@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons
 unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
 void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
 void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad4x4)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+RTCD_EXTERN unsigned int (*vp8_sad8x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
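[Editor's note: the vp8_sadNxMx3/x8/x4d prototypes in this hunk batch several SAD (sum of absolute differences) evaluations per call so the motion search can score multiple candidate blocks at once; the x4d form takes an array of four reference pointers and writes four results. For orientation only, a plain-C sketch of what an x4d-style routine computes, with a hypothetical name (sad16x16x4d_ref) and the block size fixed at 16x16; this is not the libvpx implementation.]

/* Reference sketch: one SAD per candidate reference block. */
static void sad16x16x4d_ref(const unsigned char *src_ptr, int src_stride,
                            const unsigned char *const ref_ptr[4], int ref_stride,
                            unsigned int *sad_array)
{
    for (int i = 0; i < 4; i++) {
        const unsigned char *ref = ref_ptr[i];
        unsigned int sad = 0;
        for (int r = 0; r < 16; r++) {
            for (int c = 0; c < 16; c++) {
                int d = src_ptr[r * src_stride + c] - ref[r * ref_stride + c];
                sad += (unsigned int)(d < 0 ? -d : d);  /* |src - ref| */
            }
        }
        sad_array[i] = sad;
    }
}

[Batching the four candidates lets SIMD versions share the loaded source rows across all references, which is why these grouped entry points exist alongside the single-block vp8_sadNxM functions.]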
+ +void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +RTCD_EXTERN unsigned int (*vp8_sad8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -RTCD_EXTERN unsigned int (*vp8_get_mb_ss)(const short *); +void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned 
char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -RTCD_EXTERN unsigned int (*vp8_get4x4sse_cs)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); - -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_short_fdct4x4_c(short *input, short *output, int pitch); void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); @@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned 
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output);
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch);
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_mse16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-int vp8_block_error_mmx(short *coeff, short *dqcoeff);
-int vp8_block_error_xmm(short *coeff, short *dqcoeff);
-RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff);
+unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance4x4)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
-int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
-RTCD_EXTERN int (*vp8_mbblock_error)(struct macroblock *mb, int dc);
+unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-int vp8_mbuverror_c(struct macroblock *mb);
-int vp8_mbuverror_mmx(struct macroblock *mb);
-int vp8_mbuverror_xmm(struct macroblock *mb);
-RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb);
+unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance8x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch);
 void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch);
 RTCD_EXTERN void (*vp8_subtract_b)(struct block *be, struct blockd *bd, int pitch);
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
-
 void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
 RTCD_EXTERN void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride);
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-
-int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
+RTCD_EXTERN void (*vp8_subtract_mby)(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride);
 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
 RTCD_EXTERN void (*vp8_temporal_filter_apply)(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count);
-void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c
+unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
-RTCD_EXTERN int (*vp8_denoiser_filter)(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset);
+unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance16x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance4x4)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
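/* Editor's note: each vp8_varianceWxH kernel returns the block variance and
 * also stores the raw sum of squared errors through *sse. Conceptually (an
 * illustrative sketch, not the library's optimized code):
 */
static unsigned int variance_sketch(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    int w, int h, unsigned int *sse)
{
    int sum = 0;
    unsigned int sq = 0;
    int r, c;
    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
            int d = src[c] - ref[c];
            sum += d;                     /* signed sum of differences */
            sq += (unsigned int)(d * d);  /* sum of squared differences */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sq;
    /* variance = SSE - (sum of differences)^2 / number of pixels */
    return sq - (unsigned int)(((long long)sum * sum) / (w * h));
}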
+unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x16)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance8x8)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_h)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_hv)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_variance_halfpixvar16x16_v)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
 void vp8_rtcd(void);
@@ -488,348 +494,256 @@ static void setup_rtcd_internal(void)
     (void)flags;
-    vp8_clear_system_state = vp8_clear_system_state_c;
-    if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state;
-
-    vp8_dequantize_b = vp8_dequantize_b_c;
-    if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx;
-
-    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx;
-
-    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
-    if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
-
-    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
-    if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
-    if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
-
-    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
-    if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2;
-
-    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
-    if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2;
-
-    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
-    if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2;
-
-    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
-    if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
-
-    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2;
-
-    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2;
-
-    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2;
-
-    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
-    if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx;
-    if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2;
-
-    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
-    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
-
-
-    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
-    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
-    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
-
-    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
-    if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx;
-
-    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
-    if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx;
-    if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2;
-
-    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
-    if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx;
-
-    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-    if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx;
-
-    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
-    if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
-
-    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
-    if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
-
-
-    vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
-    if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx;
-    if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm;
-
-    vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
-    if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm;
-
-    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
-    if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2;
-
-    vp8_plane_add_noise = vp8_plane_add_noise_c;
-    if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx;
-    if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt;
-
-
-
-
-    vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
-    if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
-
-    vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
-    if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
-
-
-    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
-
-    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
-
-    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
-    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
-
-    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
-    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
-
     vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_c;
     if (flags & HAS_MMX) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_mmx;
     if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
-
+    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
+    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
+    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
+    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c;
     if (flags & HAS_MMX) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_mmx;
     if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
-
-    vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx;
-
-    vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c;
-    if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx;
-
-    vp8_variance4x4 = vp8_variance4x4_c;
-    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
-
-    vp8_variance8x8 = vp8_variance8x8_c;
-    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
-
-    vp8_variance8x16 = vp8_variance8x16_c;
-    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
-
-    vp8_variance16x8 = vp8_variance16x8_c;
-    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
-
-    vp8_variance16x16 = vp8_variance16x16_c;
-    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
-
-    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
-
-    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
-
-    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
-
-    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
-
-    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
-
-    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
-
-    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
-
-    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
-    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
-    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
-
-    vp8_sad4x4 = vp8_sad4x4_c;
-    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
-    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
-
-    vp8_sad8x8 = vp8_sad8x8_c;
-    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
-    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
-
-    vp8_sad8x16 = vp8_sad8x16_c;
-    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
-    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
-
-    vp8_sad16x8 = vp8_sad16x8_c;
-    if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx;
-    if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt;
-
+    vp8_block_error = vp8_block_error_c;
+    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
+    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_c;
+    if (flags & HAS_SSE2) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_c;
+    if (flags & HAS_SSE2) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
+    vp8_clear_system_state = vp8_clear_system_state_c;
+    if (flags & HAS_MMX) vp8_clear_system_state = vpx_reset_mmx_state;
+    vp8_copy32xn = vp8_copy32xn_c;
+    if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2;
+    if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
+    vp8_copy_mem16x16 = vp8_copy_mem16x16_c;
+    if (flags & HAS_MMX) vp8_copy_mem16x16 = vp8_copy_mem16x16_mmx;
+    if (flags & HAS_SSE2) vp8_copy_mem16x16 = vp8_copy_mem16x16_sse2;
+    vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
+    if (flags & HAS_MMX) vp8_copy_mem8x4 = vp8_copy_mem8x4_mmx;
+    vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
+    if (flags & HAS_MMX) vp8_copy_mem8x8 = vp8_copy_mem8x8_mmx;
+    vp8_dc_only_idct_add = vp8_dc_only_idct_add_c;
+    if (flags & HAS_MMX) vp8_dc_only_idct_add = vp8_dc_only_idct_add_mmx;
+    vp8_denoiser_filter = vp8_denoiser_filter_c;
+    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_c;
+    if (flags & HAS_SSE2) vp8_denoiser_filter_uv = vp8_denoiser_filter_uv_sse2;
+    vp8_dequant_idct_add = vp8_dequant_idct_add_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add = vp8_dequant_idct_add_mmx;
+    vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_mmx;
+    if (flags & HAS_SSE2) vp8_dequant_idct_add_uv_block = vp8_dequant_idct_add_uv_block_sse2;
+    vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_c;
+    if (flags & HAS_MMX) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_mmx;
+    if (flags & HAS_SSE2) vp8_dequant_idct_add_y_block = vp8_dequant_idct_add_y_block_sse2;
+    vp8_dequantize_b = vp8_dequantize_b_c;
+    if (flags & HAS_MMX) vp8_dequantize_b = vp8_dequantize_b_mmx;
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
+    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
+    vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
+    if (flags & HAS_SSE2) vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
+    vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_c;
+    if (flags & HAS_SSE2) vp8_filter_by_weight8x8 = vp8_filter_by_weight8x8_sse2;
+    vp8_full_search_sad = vp8_full_search_sad_c;
+    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
+    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
+    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
+    if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
+    vp8_get_mb_ss = vp8_get_mb_ss_c;
+    if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx;
+    if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2;
+    vp8_loop_filter_bh = vp8_loop_filter_bh_c;
+    if (flags & HAS_MMX) vp8_loop_filter_bh = vp8_loop_filter_bh_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_bh = vp8_loop_filter_bh_sse2;
+    vp8_loop_filter_bv = vp8_loop_filter_bv_c;
+    if (flags & HAS_MMX) vp8_loop_filter_bv = vp8_loop_filter_bv_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_bv = vp8_loop_filter_bv_sse2;
+    vp8_loop_filter_mbh = vp8_loop_filter_mbh_c;
+    if (flags & HAS_MMX) vp8_loop_filter_mbh = vp8_loop_filter_mbh_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_mbh = vp8_loop_filter_mbh_sse2;
+    vp8_loop_filter_mbv = vp8_loop_filter_mbv_c;
+    if (flags & HAS_MMX) vp8_loop_filter_mbv = vp8_loop_filter_mbv_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_mbv = vp8_loop_filter_mbv_sse2;
+    vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_bh = vp8_loop_filter_bhs_sse2;
+    vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_bv = vp8_loop_filter_bvs_sse2;
+    vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbh = vp8_loop_filter_simple_horizontal_edge_sse2;
+    vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_c;
+    if (flags & HAS_MMX) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_mmx;
+    if (flags & HAS_SSE2) vp8_loop_filter_simple_mbv = vp8_loop_filter_simple_vertical_edge_sse2;
+    vp8_mbblock_error = vp8_mbblock_error_c;
+    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
+    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
+    vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
+    if (flags & HAS_SSE2) vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_xmm;
+    vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
+    if (flags & HAS_MMX) vp8_mbpost_proc_down = vp8_mbpost_proc_down_mmx;
+    if (flags & HAS_SSE2) vp8_mbpost_proc_down = vp8_mbpost_proc_down_xmm;
+    vp8_mbuverror = vp8_mbuverror_c;
+    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
+    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
+    vp8_mse16x16 = vp8_mse16x16_c;
+    if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
+    vp8_plane_add_noise = vp8_plane_add_noise_c;
+    if (flags & HAS_MMX) vp8_plane_add_noise = vp8_plane_add_noise_mmx;
+    if (flags & HAS_SSE2) vp8_plane_add_noise = vp8_plane_add_noise_wmt;
+    vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
+    if (flags & HAS_SSE2) vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_sse2;
+    vp8_refining_search_sad = vp8_refining_search_sad_c;
+    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
+    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
+    if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
     vp8_sad16x16 = vp8_sad16x16_c;
     if (flags & HAS_MMX) vp8_sad16x16 = vp8_sad16x16_mmx;
     if (flags & HAS_SSE2) vp8_sad16x16 = vp8_sad16x16_wmt;
     if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
-
-    vp8_sad4x4x3 = vp8_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
-
-    vp8_sad8x8x3 = vp8_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
-
-    vp8_sad8x16x3 = vp8_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
-
-    vp8_sad16x8x3 = vp8_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
-
     vp8_sad16x16x3 = vp8_sad16x16x3_c;
     if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
-
-    vp8_sad4x4x8 = vp8_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
-
-    vp8_sad8x8x8 = vp8_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
-
-    vp8_sad8x16x8 = vp8_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
-
-    vp8_sad16x8x8 = vp8_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
-
-    vp8_sad16x16x8 = vp8_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
-
-    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
-
-    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
-
-    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
-
-    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
-
     vp8_sad16x16x4d = vp8_sad16x16x4d_c;
     if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
-
-    vp8_get_mb_ss = vp8_get_mb_ss_c;
-    if (flags & HAS_MMX) vp8_get_mb_ss = vp8_get_mb_ss_mmx;
-    if (flags & HAS_SSE2) vp8_get_mb_ss = vp8_get_mb_ss_sse2;
-
-    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
-    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
-
-    vp8_mse16x16 = vp8_mse16x16_c;
-    if (flags & HAS_MMX) vp8_mse16x16 = vp8_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp8_mse16x16 = vp8_mse16x16_wmt;
-
-    vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-    if (flags & HAS_MMX) vp8_get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
-    vp8_copy32xn = vp8_copy32xn_c;
-    if (flags & HAS_SSE2) vp8_copy32xn = vp8_copy32xn_sse2;
-    if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
-
+    vp8_sad16x16x8 = vp8_sad16x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
+    vp8_sad16x8 = vp8_sad16x8_c;
+    if (flags & HAS_MMX) vp8_sad16x8 = vp8_sad16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad16x8 = vp8_sad16x8_wmt;
+    vp8_sad16x8x3 = vp8_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
+    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
+    vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4 = vp8_sad4x4_c;
+    if (flags & HAS_MMX) vp8_sad4x4 = vp8_sad4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sad4x4 = vp8_sad4x4_wmt;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16 = vp8_sad8x16_c;
+    if (flags & HAS_MMX) vp8_sad8x16 = vp8_sad8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x16 = vp8_sad8x16_wmt;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8 = vp8_sad8x8_c;
+    if (flags & HAS_MMX) vp8_sad8x8 = vp8_sad8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sad8x8 = vp8_sad8x8_wmt;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
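/* Editor's note: the assignments in setup_rtcd_internal() run in ascending
 * ISA order, so each successive "if (flags & HAS_XXX)" overwrites the
 * previous choice and the last variant the CPU supports wins, e.g.:
 *
 *     vp8_sad16x16 = vp8_sad16x16_c;                           // baseline
 *     if (flags & HAS_MMX)  vp8_sad16x16 = vp8_sad16x16_mmx;
 *     if (flags & HAS_SSE2) vp8_sad16x16 = vp8_sad16x16_wmt;
 *     if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;  // best here
 *
 * On x86 builds the flags come from x86_simd_caps(), so vp8_rtcd() must run
 * once before any of these pointers is called. */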
     vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
     if (flags & HAS_MMX) vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct4x4 = vp8_short_fdct4x4_sse2;
-
     vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
     if (flags & HAS_MMX) vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
     if (flags & HAS_SSE2) vp8_short_fdct8x4 = vp8_short_fdct8x4_sse2;
-
+    vp8_short_idct4x4llm = vp8_short_idct4x4llm_c;
+    if (flags & HAS_MMX) vp8_short_idct4x4llm = vp8_short_idct4x4llm_mmx;
+    vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_c;
+    if (flags & HAS_MMX) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_mmx;
+    if (flags & HAS_SSE2) vp8_short_inv_walsh4x4 = vp8_short_inv_walsh4x4_sse2;
     vp8_short_walsh4x4 = vp8_short_walsh4x4_c;
     if (flags & HAS_SSE2) vp8_short_walsh4x4 = vp8_short_walsh4x4_sse2;
-
-    vp8_regular_quantize_b = vp8_regular_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
-
-    vp8_fast_quantize_b = vp8_fast_quantize_b_c;
-    if (flags & HAS_SSE2) vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
-    if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-    vp8_block_error = vp8_block_error_c;
-    if (flags & HAS_MMX) vp8_block_error = vp8_block_error_mmx;
-    if (flags & HAS_SSE2) vp8_block_error = vp8_block_error_xmm;
-
-    vp8_mbblock_error = vp8_mbblock_error_c;
-    if (flags & HAS_MMX) vp8_mbblock_error = vp8_mbblock_error_mmx;
-    if (flags & HAS_SSE2) vp8_mbblock_error = vp8_mbblock_error_xmm;
-
-    vp8_mbuverror = vp8_mbuverror_c;
-    if (flags & HAS_MMX) vp8_mbuverror = vp8_mbuverror_mmx;
-    if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_xmm;
-
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_c;
+    if (flags & HAS_MMX) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_mse16x16 = vp8_sub_pixel_mse16x16_wmt;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
+    vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_wmt;
+    vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_wmt;
+    vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_c;
+    if (flags & HAS_MMX) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_wmt;
     vp8_subtract_b = vp8_subtract_b_c;
     if (flags & HAS_MMX) vp8_subtract_b = vp8_subtract_b_mmx;
     if (flags & HAS_SSE2) vp8_subtract_b = vp8_subtract_b_sse2;
-
-    vp8_subtract_mby = vp8_subtract_mby_c;
-    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
-    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
-
     vp8_subtract_mbuv = vp8_subtract_mbuv_c;
     if (flags & HAS_MMX) vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
     if (flags & HAS_SSE2) vp8_subtract_mbuv = vp8_subtract_mbuv_sse2;
-
-    vp8_full_search_sad = vp8_full_search_sad_c;
-    if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
-    if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
-    vp8_refining_search_sad = vp8_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
-
+    vp8_subtract_mby = vp8_subtract_mby_c;
+    if (flags & HAS_MMX) vp8_subtract_mby = vp8_subtract_mby_mmx;
+    if (flags & HAS_SSE2) vp8_subtract_mby = vp8_subtract_mby_sse2;
     vp8_temporal_filter_apply = vp8_temporal_filter_apply_c;
     if (flags & HAS_SSE2) vp8_temporal_filter_apply = vp8_temporal_filter_apply_sse2;
-
-
-    vp8_denoiser_filter = vp8_denoiser_filter_c;
-    if (flags & HAS_SSE2) vp8_denoiser_filter = vp8_denoiser_filter_sse2;
+    vp8_variance16x16 = vp8_variance16x16_c;
+    if (flags & HAS_MMX) vp8_variance16x16 = vp8_variance16x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x16 = vp8_variance16x16_wmt;
+    vp8_variance16x8 = vp8_variance16x8_c;
+    if (flags & HAS_MMX) vp8_variance16x8 = vp8_variance16x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance16x8 = vp8_variance16x8_wmt;
+    vp8_variance4x4 = vp8_variance4x4_c;
+    if (flags & HAS_MMX) vp8_variance4x4 = vp8_variance4x4_mmx;
+    if (flags & HAS_SSE2) vp8_variance4x4 = vp8_variance4x4_wmt;
+    vp8_variance8x16 = vp8_variance8x16_c;
+    if (flags & HAS_MMX) vp8_variance8x16 = vp8_variance8x16_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x16 = vp8_variance8x16_wmt;
+    vp8_variance8x8 = vp8_variance8x8_c;
+    if (flags & HAS_MMX) vp8_variance8x8 = vp8_variance8x8_mmx;
+    if (flags & HAS_SSE2) vp8_variance8x8 = vp8_variance8x8_wmt;
+    vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
+    vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
+    vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_c;
+    if (flags & HAS_MMX) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
+    if (flags & HAS_SSE2) vp8_variance_halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
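/* Editor's note: the #ifdef __cplusplus / extern "C" guards added above make
 * these generated C headers safe to include from C++ translation units:
 * without them, C++ name mangling would make the linker look for mangled
 * symbol names that the C and assembly objects do not export. */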
diff --git a/media/libvpx/vp8_rtcd_x86_64-darwin9-gcc.h b/media/libvpx/vp8_rtcd_x86_64-darwin9-gcc.h
index ee5ba1c3f5cb..e1c90900c587 100644
--- a/media/libvpx/vp8_rtcd_x86_64-darwin9-gcc.h
+++ b/media/libvpx/vp8_rtcd_x86_64-darwin9-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_xmm
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 #define vp8_clear_system_state vpx_reset_mmx_state
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_mmx
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+
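/* Editor's note: in this x86_64 header most symbols bind at compile time
 * with #define instead of an RTCD_EXTERN pointer, because the x86_64 baseline
 * already guarantees MMX and SSE2: when only one viable variant exists, the
 * macro routes every call site straight to it with no indirection, e.g.
 *
 *     vp8_copy_mem16x16(src, src_pitch, dst, dst_pitch);
 *     // preprocesses to: vp8_copy_mem16x16_sse2(src, src_pitch, dst, dst_pitch);
 *
 * Function pointers remain only where a better-than-baseline variant
 * (SSE3/SSSE3/SSE4.1) may or may not be available at run time. */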
vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_sse2 + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx -void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 - void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2 -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +#define vp8_dequantize_b vp8_dequantize_b_mmx + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int 
sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); +#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2 + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 + +int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx + +unsigned int vp8_get_mb_ss_c(const short *); +unsigned int vp8_get_mb_ss_mmx(const short *); +unsigned int vp8_get_mb_ss_sse2(const short *); +#define vp8_get_mb_ss vp8_get_mb_ss_sse2 + +void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); +#define vp8_intra4x4_predict vp8_intra4x4_predict_c + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); void 
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
 
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_xmm
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
 
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 #define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm
 
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_xmm
 
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_mse16x16 vp8_mse16x16_wmt
 
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 #define vp8_plane_add_noise vp8_plane_add_noise_wmt
 
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
 
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
 
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
 
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
 
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_wmt
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_wmt
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_wmt
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_wmt
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_wmt
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_wmt
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_wmt
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_wmt
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_wmt
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
@@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons
 unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad16x8 vp8_sad16x8_wmt
 
 void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
 void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad4x4 vp8_sad4x4_wmt
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad8x16 vp8_sad8x16_wmt
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad8x8 vp8_sad8x8_wmt
 
-unsigned int vp8_get_mb_ss_c(const short *);
-unsigned int vp8_get_mb_ss_mmx(const short *);
-unsigned int vp8_get_mb_ss_sse2(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_sse2
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_wmt
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx
-
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
@@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
 
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
 
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt
 
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-int vp8_block_error_c(short *coeff, short *dqcoeff);
-int vp8_block_error_mmx(short *coeff, short *dqcoeff);
-int vp8_block_error_xmm(short *coeff, short *dqcoeff);
-#define vp8_block_error vp8_block_error_xmm
+unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt
 
-int vp8_mbblock_error_c(struct macroblock *mb, int dc);
-int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
-int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
-#define vp8_mbblock_error vp8_mbblock_error_xmm
+unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
*ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt -int vp8_mbuverror_c(struct macroblock *mb); -int vp8_mbuverror_mmx(struct macroblock *mb); -int vp8_mbuverror_xmm(struct macroblock *mb); -#define vp8_mbuverror vp8_mbuverror_xmm +unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch); #define vp8_subtract_b vp8_subtract_b_sse2 -void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -#define vp8_subtract_mby vp8_subtract_mby_sse2 - void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); #define vp8_subtract_mbuv vp8_subtract_mbuv_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct 
variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +#define vp8_subtract_mby vp8_subtract_mby_sse2 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c +unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x16 vp8_variance16x16_wmt -int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -#define vp8_denoiser_filter vp8_denoiser_filter_sse2 +unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x8 vp8_variance16x8_wmt + +unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_mmx(const unsigned char 
*src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance4x4 vp8_variance4x4_wmt + +unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x16 vp8_variance8x16_wmt + +unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x8 vp8_variance8x8_wmt + +unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt + +unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt + +unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt void vp8_rtcd(void); @@ -487,169 +493,79 @@ static void setup_rtcd_internal(void) int flags = x86_simd_caps(); (void)flags; - - - - - - - - - - - - - - - - - - - - - - vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3; - - vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3; - - - - - - - - - - - - - 
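Two binding shapes alternate through the declaration hunks above. When the build knows a single best variant for this target, the generic name is bound at preprocessing time (#define vp8_variance16x16 vp8_variance16x16_wmt); when the winner depends on CPU features only known at run time, the generic name is an RTCD_EXTERN function pointer that setup_rtcd_internal() below retargets. A minimal sketch of the two shapes, using a hypothetical kernel rather than a real libvpx symbol:

/* Compile-time shape: one variant is known best for the target,
 * so the generic name is only a macro alias. */
void my_fixed_kernel_sse2(unsigned char *dst, int n);
#define my_fixed_kernel my_fixed_kernel_sse2

/* Run-time shape: several candidates exist; the generic name is a
 * function pointer (extern in consumers via RTCD_EXTERN) that is
 * pointed at the best variant once CPU flags have been probed. */
void my_rtcd_kernel_c(unsigned char *dst, int n);
void my_rtcd_kernel_ssse3(unsigned char *dst, int n);
RTCD_EXTERN void (*my_rtcd_kernel)(unsigned char *dst, int n);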
@@ -487,169 +493,79 @@ static void setup_rtcd_internal(void)
     int flags = x86_simd_caps();
     (void)flags;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
-
-    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
-
-    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
-
-    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
-
-    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
     vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
-
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
-
-    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
-
-
-
-
-
-
-
-
-    vp8_sad16x16 = vp8_sad16x16_wmt;
-    if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
-
-    vp8_sad4x4x3 = vp8_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
-
-    vp8_sad8x8x3 = vp8_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
-
-    vp8_sad8x16x3 = vp8_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
-
-    vp8_sad16x8x3 = vp8_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
-
-    vp8_sad16x16x3 = vp8_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
-
-    vp8_sad4x4x8 = vp8_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
-
-    vp8_sad8x8x8 = vp8_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
-
-    vp8_sad8x16x8 = vp8_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
-
-    vp8_sad16x8x8 = vp8_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
-
-    vp8_sad16x16x8 = vp8_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
-
-    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
-
-    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
-
-    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
-
-    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
-
-    vp8_sad16x16x4d = vp8_sad16x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
-
-
-
-
-
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
     vp8_copy32xn = vp8_copy32xn_sse2;
     if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
-
-
-
-
-
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
     vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
     vp8_full_search_sad = vp8_full_search_sad_c;
     if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
     vp8_refining_search_sad = vp8_refining_search_sad_c;
     if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
+    if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
+    vp8_sad16x16 = vp8_sad16x16_wmt;
+    if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
+    vp8_sad16x16x3 = vp8_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
+    vp8_sad16x16x4d = vp8_sad16x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
+    vp8_sad16x16x8 = vp8_sad16x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
+    vp8_sad16x8x3 = vp8_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
+    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
+    vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
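The rewritten setup_rtcd_internal() above keeps the other half of the contract: each pointer is first bound to a safe baseline (the _c fallback or the SSE2 build floor), then upgraded by successive feature tests, so the most specific supported variant is assigned last and wins. A hedged usage sketch; the calling functions and the UINT_MAX sentinel are illustrative, only vp8_rtcd() and vp8_sad16x16 come from this header:

#include <limits.h>
#include "vp8_rtcd.h"

void codec_init(void)
{
    /* vp8_rtcd() runs setup_rtcd_internal() once, so every RTCD_EXTERN
     * pointer is valid before any hot loop dereferences it. */
    vp8_rtcd();
}

unsigned int first_block_sad(const unsigned char *src, int src_stride,
                             const unsigned char *ref, int ref_stride)
{
    /* Callers use the generic name; the ISA choice is invisible here. */
    return vp8_sad16x16(src, src_stride, ref, ref_stride, UINT_MAX);
}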
diff --git a/media/libvpx/vp8_rtcd_x86_64-linux-gcc.h b/media/libvpx/vp8_rtcd_x86_64-linux-gcc.h
index ee5ba1c3f5cb..e1c90900c587 100644
--- a/media/libvpx/vp8_rtcd_x86_64-linux-gcc.h
+++ b/media/libvpx/vp8_rtcd_x86_64-linux-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
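The only functional change in the hunk above is the new language-linkage guard: wrapping the generated header in extern "C" keeps these symbols un-mangled when the header is included from a C++ translation unit. A short sketch of the failure it prevents (the caller file is hypothetical):

/* consumer.cpp -- hypothetical C++ caller. Without the in-header guard,
 * this reference would compile as a C++-mangled symbol (roughly
 * _Z8vp8_rtcdv) and fail to link against the C object defining it. */
#include "vp8_rtcd.h"

int main(void)
{
    vp8_rtcd();
    return 0;
}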
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_xmm
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 #define vp8_clear_system_state vpx_reset_mmx_state
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_mmx
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_sse2
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
-
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_mmx
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx
+
+unsigned int vp8_get_mb_ss_c(const short *);
+unsigned int vp8_get_mb_ss_mmx(const short *);
+unsigned int vp8_get_mb_ss_sse2(const short *);
+#define vp8_get_mb_ss vp8_get_mb_ss_sse2
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_xmm
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 #define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_xmm
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_mse16x16 vp8_mse16x16_wmt
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 #define vp8_plane_add_noise vp8_plane_add_noise_wmt
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_wmt
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_wmt
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_wmt
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_wmt
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_wmt
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_wmt
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_wmt
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_wmt
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_wmt
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
@@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons
 unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad16x8 vp8_sad16x8_wmt
 void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad4x4 vp8_sad4x4_wmt
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad8x16 vp8_sad8x16_wmt
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const
ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad8x8 vp8_sad8x8_wmt -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_sse2 +void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt +void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned 
int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_wmt - -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx - -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_short_fdct4x4_c(short *input, short *output, int pitch); void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); @@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_mmx(short *input, short *output); +void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 -void vp8_regular_quantize_b_c(struct block *, struct blockd *); -void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); -#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2 +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_fast_quantize_b_c(struct block *, struct blockd *); 
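The generated RTCD headers in this patch all follow one dispatch pattern: each hot function is declared once per instruction set, and the generic name is then either a plain #define (when the build's baseline ISA already implies the best variant, e.g. vp8_sad16x8 resolving to vp8_sad16x8_wmt) or an RTCD_EXTERN function pointer that setup_rtcd_internal() retargets at runtime from the x86_simd_caps() flag bits (HAS_SSE3, HAS_SSSE3, HAS_SSE4_1). Below is a minimal, self-contained sketch of that pattern, not part of the patch itself; the my_sad_* kernels and the MY_HAS_SSE3 bit are hypothetical stand-ins for the vp8_* variants and HAS_* flags declared here, so only the shape matches the real code.

#include <stdio.h>

#define MY_HAS_SSE3 (1 << 0)  /* hypothetical stand-in for the real HAS_SSE3 capability bit */

/* portable fallback: always correct, never the fastest */
static unsigned int my_sad_c(const unsigned char *src, const unsigned char *ref)
{
    (void)src; (void)ref;
    return 0;
}

/* specialised kernel: only valid on CPUs that report SSE3 */
static unsigned int my_sad_sse3(const unsigned char *src, const unsigned char *ref)
{
    (void)src; (void)ref;
    return 0;
}

/* the RTCD_EXTERN pointer: every caller dispatches through it */
static unsigned int (*my_sad)(const unsigned char *, const unsigned char *);

static void my_setup_rtcd(int flags)
{
    my_sad = my_sad_c;                              /* safest default first */
    if (flags & MY_HAS_SSE3) my_sad = my_sad_sse3;  /* then upgrade if the CPU allows */
}

int main(void)
{
    unsigned char a[16] = {0}, b[16] = {0};
    my_setup_rtcd(MY_HAS_SSE3);  /* a real build would pass x86_simd_caps() here */
    printf("sad = %u\n", my_sad(a, b));
    return 0;
}

The #define form exists so that functions whose best variant is guaranteed by the minimum ISA pay no indirection cost; only functions with a variant above the baseline go through the pointer, which is why the hunks below convert some symbols between the two forms.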
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); -void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); -RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); -#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); -#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_quantize_mb_c(struct macroblock *); -#define vp8_quantize_mb vp8_quantize_mb_c +unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt -void vp8_quantize_mby_c(struct macroblock *); -#define vp8_quantize_mby vp8_quantize_mby_c +unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char 
*src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -void vp8_quantize_mbuv_c(struct macroblock *); -#define vp8_quantize_mbuv vp8_quantize_mbuv_c +unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -int vp8_block_error_c(short *coeff, short *dqcoeff); -int vp8_block_error_mmx(short *coeff, short *dqcoeff); -int vp8_block_error_xmm(short *coeff, short *dqcoeff); -#define vp8_block_error vp8_block_error_xmm +unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt -int vp8_mbblock_error_c(struct macroblock *mb, int dc); -int vp8_mbblock_error_mmx(struct macroblock *mb, int dc); -int vp8_mbblock_error_xmm(struct macroblock *mb, int dc); -#define vp8_mbblock_error vp8_mbblock_error_xmm +unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt -int vp8_mbuverror_c(struct macroblock *mb); -int vp8_mbuverror_mmx(struct macroblock *mb); -int vp8_mbuverror_xmm(struct macroblock *mb); -#define vp8_mbuverror vp8_mbuverror_xmm +unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, 
unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch); #define vp8_subtract_b vp8_subtract_b_sse2 -void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -#define vp8_subtract_mby vp8_subtract_mby_sse2 - void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); #define vp8_subtract_mbuv vp8_subtract_mbuv_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int 
search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +#define vp8_subtract_mby vp8_subtract_mby_sse2 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c +unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x16 vp8_variance16x16_wmt -int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -#define vp8_denoiser_filter vp8_denoiser_filter_sse2 +unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x8 vp8_variance16x8_wmt + +unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance4x4 vp8_variance4x4_wmt + +unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_variance8x16 vp8_variance8x16_wmt
+
+unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_variance8x8 vp8_variance8x8_wmt
+
+unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
+
+unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt

 void vp8_rtcd(void);
@@ -487,169 +493,79 @@ static void setup_rtcd_internal(void)
     int flags = x86_simd_caps();

     (void)flags;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
-
-    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
-    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
-
-    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
-
-    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
-
-    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
-    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;

     vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3;
-
     vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2;
     if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
-
-    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
-
-
-
-
-
-
-
-
-    vp8_sad16x16 = vp8_sad16x16_wmt;
-    if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
-
-    vp8_sad4x4x3 = vp8_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
-
-    vp8_sad8x8x3 = vp8_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
-
-    vp8_sad8x16x3 = vp8_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
-
-    vp8_sad16x8x3 = vp8_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
-
-    vp8_sad16x16x3 = vp8_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
-
-    vp8_sad4x4x8 = vp8_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
-
-    vp8_sad8x8x8 = vp8_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
-
-    vp8_sad8x16x8 = vp8_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
-
-    vp8_sad16x8x8 = vp8_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
-
-    vp8_sad16x16x8 = vp8_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
-
-    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
-
-    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
-
-    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
-
-    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
-
-    vp8_sad16x16x4d = vp8_sad16x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
-
-
-
-
-
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
     vp8_copy32xn = vp8_copy32xn_sse2;
     if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
-
-
-
-
-
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
     vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
     vp8_full_search_sad = vp8_full_search_sad_c;
     if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
     vp8_refining_search_sad = vp8_refining_search_sad_c;
     if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
+    if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
+    vp8_sad16x16 = vp8_sad16x16_wmt;
+    if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
+    vp8_sad16x16x3 = vp8_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
+    vp8_sad16x16x4d = vp8_sad16x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
+    vp8_sad16x16x8 = vp8_sad16x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
+    vp8_sad16x8x3 = vp8_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
+    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
+    vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
 }
 #endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif
diff --git a/media/libvpx/vp8_rtcd_x86_64-win64-gcc.h b/media/libvpx/vp8_rtcd_x86_64-win64-gcc.h
index ee5ba1c3f5cb..e1c90900c587 100644
--- a/media/libvpx/vp8_rtcd_x86_64-win64-gcc.h
+++ b/media/libvpx/vp8_rtcd_x86_64-win64-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;

+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void
vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); + +void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_b vp8_blend_b_c + +void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_mb_inner vp8_blend_mb_inner_c + +void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_mb_outer vp8_blend_mb_outer_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +int vp8_block_error_mmx(short *coeff, short *dqcoeff); +int vp8_block_error_xmm(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_xmm + +void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); +RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); + +void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd 
*x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); +RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); + void vp8_clear_system_state_c(); void vpx_reset_mmx_state(); #define vp8_clear_system_state vpx_reset_mmx_state -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); -#define vp8_dequantize_b vp8_dequantize_b_mmx +void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_sse2 + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int 
stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx -void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 - void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2 -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +#define vp8_dequantize_b vp8_dequantize_b_mmx + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); +void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); + +void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); +#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2 + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char 
*dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 + +int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); + +unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx + +unsigned int vp8_get_mb_ss_c(const short *); +unsigned int vp8_get_mb_ss_mmx(const short *); +unsigned int vp8_get_mb_ss_sse2(const short *); +#define vp8_get_mb_ss vp8_get_mb_ss_sse2 + +void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); +#define vp8_intra4x4_predict vp8_intra4x4_predict_c + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); @@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char * void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 - -void 
vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 - -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 - -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2 -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); -#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_mmx(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); -#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char 
*blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +int vp8_mbblock_error_mmx(struct macroblock *mb, int dc); +int vp8_mbblock_error_xmm(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_xmm -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 - -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx - -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx - -void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); -RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride); - -void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); -RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride); - -void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); -#define vp8_intra4x4_predict vp8_intra4x4_predict_c +void vp8_mbpost_proc_across_ip_c(unsigned 
char *dst, int pitch, int rows, int cols,int flimit); +void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); #define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm -void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm +int vp8_mbuverror_c(struct macroblock *mb); +int vp8_mbuverror_mmx(struct macroblock *mb); +int vp8_mbuverror_xmm(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_xmm -void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2 +unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_mse16x16 vp8_mse16x16_wmt void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch); #define vp8_plane_add_noise vp8_plane_add_noise_wmt -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c +void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2 -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c +void vp8_quantize_mb_c(struct macroblock *); +#define vp8_quantize_mb vp8_quantize_mb_c -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c +void vp8_quantize_mbuv_c(struct macroblock *); +#define vp8_quantize_mbuv vp8_quantize_mbuv_c -void 
vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2 +void vp8_quantize_mby_c(struct macroblock *); +#define vp8_quantize_mby vp8_quantize_mby_c -void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2 +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); -#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); +void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *); +RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *); -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int 
src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx - -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx - -unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance4x4 vp8_variance4x4_wmt - -unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char 
*ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x8 vp8_variance8x8_wmt - -unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance8x16 vp8_variance8x16_wmt - -unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x8 vp8_variance16x8_wmt - -unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance16x16 vp8_variance16x16_wmt - -unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt - -unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt - -unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt - -unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int 
yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); - -unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); - -unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt - -unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt - -unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt - -unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, 
const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp8_sad4x4 vp8_sad4x4_wmt - -unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp8_sad8x8 vp8_sad8x8_wmt - -unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp8_sad8x16 vp8_sad8x16_wmt - -unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp8_sad16x8 vp8_sad16x8_wmt +void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); +#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); @@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); -void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void 
(*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); + +unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad16x8 vp8_sad16x8_wmt void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); - 
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); - -void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); - -void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad4x4 vp8_sad4x4_wmt + +void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, 
unsigned int *sad_array); void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); + +unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad8x16 vp8_sad8x16_wmt + +void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x16x8)(const 
unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); -void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad); +#define vp8_sad8x8 vp8_sad8x8_wmt -unsigned int vp8_get_mb_ss_c(const short *); -unsigned int vp8_get_mb_ss_mmx(const short *); -unsigned int vp8_get_mb_ss_sse2(const short *); -#define vp8_get_mb_ss vp8_get_mb_ss_sse2 +void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array); -unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt +void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array); -unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); -#define vp8_mse16x16 vp8_mse16x16_wmt - -unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); -#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx - -void vp8_copy32xn_c(const unsigned 
char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n); +void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); +RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array); void vp8_short_fdct4x4_c(short *input, short *output, int pitch); void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch); @@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_mmx(short *input, short *output); +void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 -void vp8_regular_quantize_b_c(struct block *, struct blockd *); -void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); -#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2 +void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_fast_quantize_b_c(struct block *, struct blockd *); -void vp8_fast_quantize_b_sse2(struct block *, struct blockd *); -void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *); -RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *); +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, 
unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); -#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2); -#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_quantize_mb_c(struct macroblock *); -#define vp8_quantize_mb vp8_quantize_mb_c +unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt -void vp8_quantize_mby_c(struct macroblock *); -#define vp8_quantize_mby vp8_quantize_mby_c +unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -void vp8_quantize_mbuv_c(struct 
macroblock *); -#define vp8_quantize_mbuv vp8_quantize_mbuv_c +unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -int vp8_block_error_c(short *coeff, short *dqcoeff); -int vp8_block_error_mmx(short *coeff, short *dqcoeff); -int vp8_block_error_xmm(short *coeff, short *dqcoeff); -#define vp8_block_error vp8_block_error_xmm +unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt -int vp8_mbblock_error_c(struct macroblock *mb, int dc); -int vp8_mbblock_error_mmx(struct macroblock *mb, int dc); -int vp8_mbblock_error_xmm(struct macroblock *mb, int dc); -#define vp8_mbblock_error vp8_mbblock_error_xmm +unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt -int vp8_mbuverror_c(struct macroblock *mb); -int vp8_mbuverror_mmx(struct macroblock *mb); -int vp8_mbuverror_xmm(struct macroblock *mb); -#define vp8_mbuverror vp8_mbuverror_xmm +unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); void 
vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch); #define vp8_subtract_b vp8_subtract_b_sse2 -void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -#define vp8_subtract_mby vp8_subtract_mby_sse2 - void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); #define vp8_subtract_mbuv vp8_subtract_mbuv_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_mmx(short *diff, 
unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +#define vp8_subtract_mby vp8_subtract_mby_sse2 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c +unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x16 vp8_variance16x16_wmt -int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -#define vp8_denoiser_filter vp8_denoiser_filter_sse2 +unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x8 vp8_variance16x8_wmt + +unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance4x4 vp8_variance4x4_wmt + +unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x16 vp8_variance8x16_wmt + +unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, 
int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x8 vp8_variance8x8_wmt + +unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt + +unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt + +unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt void vp8_rtcd(void); @@ -487,169 +493,79 @@ static void setup_rtcd_internal(void) int flags = x86_simd_caps(); (void)flags; - - - - - - - - - - - - - - - - - - - - - - vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3; - - vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3; - - - - - - - - - - - - - vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; - - vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; - - vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; - - vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; - if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; - vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; - - - - - - - - - - - - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; - if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3; - - 
vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
-    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
-
-
-
-
-
-
-
-
-    vp8_sad16x16 = vp8_sad16x16_wmt;
-    if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
-
-    vp8_sad4x4x3 = vp8_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
-
-    vp8_sad8x8x3 = vp8_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
-
-    vp8_sad8x16x3 = vp8_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
-
-    vp8_sad16x8x3 = vp8_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
-
-    vp8_sad16x16x3 = vp8_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
-
-    vp8_sad4x4x8 = vp8_sad4x4x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
-
-    vp8_sad8x8x8 = vp8_sad8x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
-
-    vp8_sad8x16x8 = vp8_sad8x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
-
-    vp8_sad16x8x8 = vp8_sad16x8x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
-
-    vp8_sad16x16x8 = vp8_sad16x16x8_c;
-    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
-
-    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
-    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
-
-    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
-
-    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
-
-    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
-
-    vp8_sad16x16x4d = vp8_sad16x16x4d_c;
-    if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
-
-
-
-
-
+    vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3;
+    vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2;
+    if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3;
     vp8_copy32xn = vp8_copy32xn_sse2;
     if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3;
-
-
-
-
-
+    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
+    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
     vp8_fast_quantize_b = vp8_fast_quantize_b_sse2;
     if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
     vp8_full_search_sad = vp8_full_search_sad_c;
     if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8;
-
     vp8_refining_search_sad = vp8_refining_search_sad_c;
     if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4;
-
-    vp8_diamond_search_sad = vp8_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4;
+    vp8_regular_quantize_b = vp8_regular_quantize_b_sse2;
+    if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1;
+    vp8_sad16x16 = vp8_sad16x16_wmt;
+    if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3;
+    vp8_sad16x16x3 = vp8_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3;
+    vp8_sad16x16x4d = vp8_sad16x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3;
+    vp8_sad16x16x8 = vp8_sad16x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4;
+    vp8_sad16x8x3 = vp8_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3;
+    vp8_sad16x8x4d = vp8_sad16x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3;
+    vp8_sad16x8x8 = vp8_sad16x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4;
+    vp8_sad4x4x3 = vp8_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3;
+    vp8_sad4x4x4d = vp8_sad4x4x4d_c;
+    if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3;
+    vp8_sad4x4x8 = vp8_sad4x4x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4;
+    vp8_sad8x16x3 = vp8_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3;
+    vp8_sad8x16x4d = vp8_sad8x16x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3;
+    vp8_sad8x16x8 = vp8_sad8x16x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4;
+    vp8_sad8x8x3 = vp8_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3;
+    vp8_sad8x8x4d = vp8_sad8x8x4d_c;
+    if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3;
+    vp8_sad8x8x8 = vp8_sad8x8x8_c;
+    if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4;
+    vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3;
+    vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3;
+    vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3;
+    vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2;
+    if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3;
+    vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3;
+    vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt;
+    if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3;
 }
 #endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif
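The headers touched in this part of the patch are libvpx's generated run-time CPU detection (RTCD) glue: each vp8_* entry point is either #define'd straight to the single implementation chosen at build time, or declared as an RTCD_EXTERN function pointer that setup_rtcd_internal() repoints to the best variant the running CPU supports, based on the HAS_SSE3/HAS_SSSE3/HAS_SSE4_1 bits reported by the library's CPU probe. As a minimal, self-contained sketch of that pattern (the toy sad16x16 variants, my_cpu_caps(), and the HAS_SSE3 value below are illustrative stand-ins, not the real libvpx API):

#include <stdio.h>
#include <string.h>

#define HAS_SSE3 0x01  /* stand-in for the real HAS_* capability bits */

/* Portable baseline: sum of absolute differences over a 16x16 block. */
static unsigned int sad16x16_c(const unsigned char *src, const unsigned char *ref)
{
    unsigned int sad = 0;
    int i;
    for (i = 0; i < 16 * 16; i++)
        sad += (src[i] > ref[i]) ? (src[i] - ref[i]) : (ref[i] - src[i]);
    return sad;
}

/* "Optimized" variant; a real build would use SSE3 intrinsics or asm here. */
static unsigned int sad16x16_sse3(const unsigned char *src, const unsigned char *ref)
{
    return sad16x16_c(src, ref);
}

/* The RTCD_EXTERN slot: all callers go through this pointer. */
static unsigned int (*sad16x16)(const unsigned char *, const unsigned char *);

static int my_cpu_caps(void) { return HAS_SSE3; }  /* pretend the CPU has SSE3 */

static void setup_rtcd_internal(void)
{
    int flags = my_cpu_caps();
    sad16x16 = sad16x16_c;                           /* safe baseline first */
    if (flags & HAS_SSE3) sad16x16 = sad16x16_sse3;  /* then the best available */
}

int main(void)
{
    unsigned char src[256], ref[256];
    memset(src, 0, sizeof(src));
    memset(ref, 1, sizeof(ref));
    setup_rtcd_internal();
    printf("%u\n", sad16x16(src, ref));  /* prints 256 */
    return 0;
}

Assigning the portable (or lowest-common) variant before any flag test is what keeps these tables safe on CPUs without the optional extensions; the rewritten blocks in this patch keep that shape while sorting the entries alphabetically.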
diff --git a/media/libvpx/vp8_rtcd_x86_64-win64-vs8.h b/media/libvpx/vp8_rtcd_x86_64-win64-vs8.h
index ee5ba1c3f5cb..e1c90900c587 100644
--- a/media/libvpx/vp8_rtcd_x86_64-win64-vs8.h
+++ b/media/libvpx/vp8_rtcd_x86_64-win64-vs8.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP8
  */
@@ -22,32 +26,146 @@ struct variance_vtable;
 union int_mv;
 struct yv12_buffer_config;
 
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+int vp8_block_error_mmx(short *coeff, short *dqcoeff);
+int vp8_block_error_xmm(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_xmm
+
+void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
+
+void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
+
 void vp8_clear_system_state_c();
 void vpx_reset_mmx_state();
 #define vp8_clear_system_state vpx_reset_mmx_state
 
-void vp8_dequantize_b_c(struct blockd*, short *dqc);
-void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
-#define vp8_dequantize_b vp8_dequantize_b_mmx
+void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_sse2
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2
 
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
 void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride);
 #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
 
-void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
-#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
-
 void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_mmx(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
 #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_sse2
 
-void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_mmx(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_mmx(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_mmx
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+
+void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+
+unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx
+
+unsigned int vp8_get_mb_ss_c(const short *);
+unsigned int vp8_get_mb_ss_mmx(const short *);
+unsigned int vp8_get_mb_ss_sse2(const short *);
+#define vp8_get_mb_ss vp8_get_mb_ss_sse2
+
+void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
+#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
 
 void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
@@ -59,240 +177,84 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y, unsigned char *u, unsigned char *
 void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2
 
-void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
-#define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
-
-void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
-
-void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
-
-void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
-void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
-#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_mmx(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2
 
 void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
 void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
 #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2
 
-void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
-#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_sse2
 
-void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
-#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2
 
-void vp8_short_inv_walsh4x4_c(short *input, short *output);
-void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
-void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
-#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2
 
-void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
-#define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+int vp8_mbblock_error_mmx(struct macroblock *mb, int dc);
+int vp8_mbblock_error_xmm(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_xmm
 
-void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2
-
-void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx
-
-void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
-#define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx
-
-void vp8_build_intra_predictors_mby_s_c(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_sse2(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-void vp8_build_intra_predictors_mby_s_ssse3(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mby_s)(struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride);
-
-void vp8_build_intra_predictors_mbuv_s_c(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_sse2(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-void vp8_build_intra_predictors_mbuv_s_ssse3(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-RTCD_EXTERN void (*vp8_build_intra_predictors_mbuv_s)(struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride);
-
-void vp8_intra4x4_predict_c(unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left);
-#define vp8_intra4x4_predict vp8_intra4x4_predict_c
+void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
 
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 #define vp8_mbpost_proc_down vp8_mbpost_proc_down_xmm
 
-void vp8_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-#define vp8_mbpost_proc_across_ip vp8_mbpost_proc_across_ip_xmm
+int vp8_mbuverror_c(struct macroblock *mb);
+int vp8_mbuverror_mmx(struct macroblock *mb);
+int vp8_mbuverror_xmm(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_xmm
 
-void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
-#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
+unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp8_mse16x16 vp8_mse16x16_wmt
 
 void vp8_plane_add_noise_c(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_mmx(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 void vp8_plane_add_noise_wmt(unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch);
 #define vp8_plane_add_noise vp8_plane_add_noise_wmt
 
-void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+void vp8_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+void vp8_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
+#define vp8_post_proc_down_and_across_mb_row vp8_post_proc_down_and_across_mb_row_sse2
 
-void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+void vp8_quantize_mb_c(struct macroblock *);
+#define vp8_quantize_mb vp8_quantize_mb_c
 
-void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp8_blend_b vp8_blend_b_c
+void vp8_quantize_mbuv_c(struct macroblock *);
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
 
-void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_sse2
+void vp8_quantize_mby_c(struct macroblock *);
+#define vp8_quantize_mby vp8_quantize_mby_c
 
-void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
 
-void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
-#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
+void vp8_regular_quantize_b_sse4_1(struct block *, struct blockd *);
+RTCD_EXTERN void (*vp8_regular_quantize_b)(struct block *, struct blockd *);
 
-void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-
-void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx
-
-void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
-#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx
-
-unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance4x4 vp8_variance4x4_wmt
-
-unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x8 vp8_variance8x8_wmt
-
-unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance8x16 vp8_variance8x16_wmt
-
-unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x8 vp8_variance16x8_wmt
-
-unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance16x16 vp8_variance16x16_wmt
-
-unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt
-
-unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt
-
-unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt
-
-unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad4x4 vp8_sad4x4_wmt
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x8 vp8_sad8x8_wmt
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad8x16 vp8_sad8x16_wmt
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp8_sad16x8 vp8_sad16x8_wmt
+void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
+#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
 
 unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 unsigned int vp8_sad16x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
@@ -300,91 +262,87 @@ unsigned int vp8_sad16x16_wmt(const unsigned char *src_ptr, int src_stride, cons
 unsigned int vp8_sad16x16_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 RTCD_EXTERN unsigned int (*vp8_sad16x16)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
 
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad16x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad16x8 vp8_sad16x8_wmt
 
 void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 void vp8_sad16x8x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x3_ssse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
 void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 void vp8_sad16x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 RTCD_EXTERN void (*vp8_sad16x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-void vp8_sad16x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad4x4_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad4x4 vp8_sad4x4_wmt
+
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad4x4x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad4x4x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad4x4x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad4x4x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x16_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad8x16 vp8_sad8x16_wmt
+
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
 void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 void vp8_sad8x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 RTCD_EXTERN void (*vp8_sad8x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x16x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x16x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp8_sad16x16x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp8_sad16x16x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_mmx(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+unsigned int vp8_sad8x8_wmt(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad);
+#define vp8_sad8x8 vp8_sad8x8_wmt
 
-unsigned int vp8_get_mb_ss_c(const short *);
-unsigned int vp8_get_mb_ss_mmx(const short *);
-unsigned int vp8_get_mb_ss_sse2(const short *);
-#define vp8_get_mb_ss vp8_get_mb_ss_sse2
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x3_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x3)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
-#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp8_sad8x8x4d_sse3(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x4d)(const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp8_mse16x16 vp8_mse16x16_wmt
-
-unsigned int vp8_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-unsigned int vp8_get4x4sse_cs_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
-#define vp8_get4x4sse_cs vp8_get4x4sse_cs_mmx
-
-void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
-RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n);
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+void vp8_sad8x8x8_sse4(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
+RTCD_EXTERN void (*vp8_sad8x8x8)(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array);
 
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
 void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
@@ -396,87 +354,135 @@ void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
 void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2
 
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_mmx(short *input, short *output);
+void vp8_short_inv_walsh4x4_sse2(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
 void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
 void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch);
 #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2
 
-void vp8_regular_quantize_b_c(struct block *, struct blockd *);
-void vp8_regular_quantize_b_sse2(struct block *, struct blockd *);
-#define vp8_regular_quantize_b vp8_regular_quantize_b_sse2
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_c(struct block *, struct blockd *);
-void vp8_fast_quantize_b_sse2(struct block *, struct blockd *);
-void vp8_fast_quantize_b_ssse3(struct block *, struct blockd *);
-RTCD_EXTERN void (*vp8_fast_quantize_b)(struct block *, struct blockd *);
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_regular_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_regular_quantize_b_pair vp8_regular_quantize_b_pair_c
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_fast_quantize_b_pair_c(struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2);
-#define vp8_fast_quantize_b_pair vp8_fast_quantize_b_pair_c
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
 
-void vp8_quantize_mb_c(struct macroblock *);
-#define vp8_quantize_mb vp8_quantize_mb_c
+unsigned int vp8_sub_pixel_mse16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+#define vp8_sub_pixel_mse16x16 vp8_sub_pixel_mse16x16_wmt
 
-void vp8_quantize_mby_c(struct macroblock *);
-#define vp8_quantize_mby vp8_quantize_mby_c
+unsigned int vp8_sub_pixel_variance16x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x16_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x16)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
 
-void vp8_quantize_mbuv_c(struct macroblock *);
-#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+unsigned int vp8_sub_pixel_variance16x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse);
+unsigned int vp8_sub_pixel_variance16x8_wmt(const
unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance16x8_ssse3(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp8_sub_pixel_variance16x8)(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); -int vp8_block_error_c(short *coeff, short *dqcoeff); -int vp8_block_error_mmx(short *coeff, short *dqcoeff); -int vp8_block_error_xmm(short *coeff, short *dqcoeff); -#define vp8_block_error vp8_block_error_xmm +unsigned int vp8_sub_pixel_variance4x4_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance4x4 vp8_sub_pixel_variance4x4_wmt -int vp8_mbblock_error_c(struct macroblock *mb, int dc); -int vp8_mbblock_error_mmx(struct macroblock *mb, int dc); -int vp8_mbblock_error_xmm(struct macroblock *mb, int dc); -#define vp8_mbblock_error vp8_mbblock_error_xmm +unsigned int vp8_sub_pixel_variance8x16_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x16 vp8_sub_pixel_variance8x16_wmt -int vp8_mbuverror_c(struct macroblock *mb); -int vp8_mbuverror_mmx(struct macroblock *mb); -int vp8_mbuverror_xmm(struct macroblock *mb); -#define vp8_mbuverror vp8_mbuverror_xmm +unsigned int vp8_sub_pixel_variance8x8_c(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse); +#define vp8_sub_pixel_variance8x8 vp8_sub_pixel_variance8x8_wmt void vp8_subtract_b_c(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_mmx(struct block *be, struct blockd *bd, int pitch); void vp8_subtract_b_sse2(struct block *be, struct blockd *bd, int pitch); #define vp8_subtract_b vp8_subtract_b_sse2 -void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); -void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, 
unsigned char *pred, int pred_stride); -#define vp8_subtract_mby vp8_subtract_mby_sse2 - void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride); #define vp8_subtract_mbuv vp8_subtract_mbuv_sse2 -int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); - -int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_diamond_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride); +#define vp8_subtract_mby vp8_subtract_mby_sse2 void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned 
char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); #define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2 -void vp8_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vp8_yv12_copy_partial_frame vp8_yv12_copy_partial_frame_c +unsigned int vp8_variance16x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x16 vp8_variance16x16_wmt -int vp8_denoiser_filter_c(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -int vp8_denoiser_filter_sse2(struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset); -#define vp8_denoiser_filter vp8_denoiser_filter_sse2 +unsigned int vp8_variance16x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance16x8 vp8_variance16x8_wmt + +unsigned int vp8_variance4x4_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance4x4 vp8_variance4x4_wmt + +unsigned int vp8_variance8x16_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x16 vp8_variance8x16_wmt + +unsigned int vp8_variance8x8_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance8x8 vp8_variance8x8_wmt + +unsigned int vp8_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_wmt + +unsigned int vp8_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_wmt + +unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse); +#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_wmt void vp8_rtcd(void); @@ -487,169 +493,79 @@ static void setup_rtcd_internal(void) int flags = x86_simd_caps(); (void)flags; - - - - - - - - - - - - - - - - - - - - - - vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3; - - vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2; - if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3; - - - - - - - - - - - - - vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; - - vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; - - vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; - if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; - - vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; - if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; - vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; - - - - - - - - - - - - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; - if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3; - - vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; - if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3; - - - - - - - - - vp8_sad16x16 = vp8_sad16x16_wmt; - if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3; - - vp8_sad4x4x3 = vp8_sad4x4x3_c; - if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3; - - vp8_sad8x8x3 = vp8_sad8x8x3_c; - if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3; - - vp8_sad8x16x3 = 
vp8_sad8x16x3_c; - if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3; - - vp8_sad16x8x3 = vp8_sad16x8x3_c; - if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3; - - vp8_sad16x16x3 = vp8_sad16x16x3_c; - if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3; - - vp8_sad4x4x8 = vp8_sad4x4x8_c; - if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4; - - vp8_sad8x8x8 = vp8_sad8x8x8_c; - if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4; - - vp8_sad8x16x8 = vp8_sad8x16x8_c; - if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4; - - vp8_sad16x8x8 = vp8_sad16x8x8_c; - if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4; - - vp8_sad16x16x8 = vp8_sad16x16x8_c; - if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4; - - vp8_sad4x4x4d = vp8_sad4x4x4d_c; - if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3; - - vp8_sad8x8x4d = vp8_sad8x8x4d_c; - if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3; - - vp8_sad8x16x4d = vp8_sad8x16x4d_c; - if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3; - - vp8_sad16x8x4d = vp8_sad16x8x4d_c; - if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3; - - vp8_sad16x16x4d = vp8_sad16x16x4d_c; - if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3; - - - - - + vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_sse2; + if (flags & HAS_SSSE3) vp8_build_intra_predictors_mbuv_s = vp8_build_intra_predictors_mbuv_s_ssse3; + vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_sse2; + if (flags & HAS_SSSE3) vp8_build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s_ssse3; vp8_copy32xn = vp8_copy32xn_sse2; if (flags & HAS_SSE3) vp8_copy32xn = vp8_copy32xn_sse3; - - - - - + vp8_diamond_search_sad = vp8_diamond_search_sad_c; + if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4; vp8_fast_quantize_b = vp8_fast_quantize_b_sse2; if (flags & HAS_SSSE3) vp8_fast_quantize_b = vp8_fast_quantize_b_ssse3; - - - - - - - - - - - - vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; - - vp8_diamond_search_sad = vp8_diamond_search_sad_c; - if (flags & HAS_SSE3) vp8_diamond_search_sad = vp8_diamond_search_sadx4; + vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; + if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; + vp8_sad16x16 = vp8_sad16x16_wmt; + if (flags & HAS_SSE3) vp8_sad16x16 = vp8_sad16x16_sse3; + vp8_sad16x16x3 = vp8_sad16x16x3_c; + if (flags & HAS_SSE3) vp8_sad16x16x3 = vp8_sad16x16x3_sse3; + if (flags & HAS_SSSE3) vp8_sad16x16x3 = vp8_sad16x16x3_ssse3; + vp8_sad16x16x4d = vp8_sad16x16x4d_c; + if (flags & HAS_SSE3) vp8_sad16x16x4d = vp8_sad16x16x4d_sse3; + vp8_sad16x16x8 = vp8_sad16x16x8_c; + if (flags & HAS_SSE4_1) vp8_sad16x16x8 = vp8_sad16x16x8_sse4; + vp8_sad16x8x3 = vp8_sad16x8x3_c; + if (flags & HAS_SSE3) vp8_sad16x8x3 = vp8_sad16x8x3_sse3; + if (flags & HAS_SSSE3) vp8_sad16x8x3 = vp8_sad16x8x3_ssse3; + vp8_sad16x8x4d = vp8_sad16x8x4d_c; + if (flags & HAS_SSE3) vp8_sad16x8x4d = vp8_sad16x8x4d_sse3; + vp8_sad16x8x8 = vp8_sad16x8x8_c; + if (flags & HAS_SSE4_1) vp8_sad16x8x8 = vp8_sad16x8x8_sse4; + vp8_sad4x4x3 = 
vp8_sad4x4x3_c; + if (flags & HAS_SSE3) vp8_sad4x4x3 = vp8_sad4x4x3_sse3; + vp8_sad4x4x4d = vp8_sad4x4x4d_c; + if (flags & HAS_SSE3) vp8_sad4x4x4d = vp8_sad4x4x4d_sse3; + vp8_sad4x4x8 = vp8_sad4x4x8_c; + if (flags & HAS_SSE4_1) vp8_sad4x4x8 = vp8_sad4x4x8_sse4; + vp8_sad8x16x3 = vp8_sad8x16x3_c; + if (flags & HAS_SSE3) vp8_sad8x16x3 = vp8_sad8x16x3_sse3; + vp8_sad8x16x4d = vp8_sad8x16x4d_c; + if (flags & HAS_SSE3) vp8_sad8x16x4d = vp8_sad8x16x4d_sse3; + vp8_sad8x16x8 = vp8_sad8x16x8_c; + if (flags & HAS_SSE4_1) vp8_sad8x16x8 = vp8_sad8x16x8_sse4; + vp8_sad8x8x3 = vp8_sad8x8x3_c; + if (flags & HAS_SSE3) vp8_sad8x8x3 = vp8_sad8x8x3_sse3; + vp8_sad8x8x4d = vp8_sad8x8x4d_c; + if (flags & HAS_SSE3) vp8_sad8x8x4d = vp8_sad8x8x4d_sse3; + vp8_sad8x8x8 = vp8_sad8x8x8_c; + if (flags & HAS_SSE4_1) vp8_sad8x8x8 = vp8_sad8x8x8_sse4; + vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_ssse3; + vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_mmx; + if (flags & HAS_SSSE3) vp8_sixtap_predict4x4 = vp8_sixtap_predict4x4_ssse3; + vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x4 = vp8_sixtap_predict8x4_ssse3; + vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_sse2; + if (flags & HAS_SSSE3) vp8_sixtap_predict8x8 = vp8_sixtap_predict8x8_ssse3; + vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_wmt; + if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ssse3; + vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_wmt; + if (flags & HAS_SSSE3) vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ssse3; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/media/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c index d8b24bfaf465..f0881b5ae9cd 100644 --- a/media/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c +++ b/media/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c @@ -25,12 +25,14 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, // Account for the vertical phase needing 3 lines prior and 4 lines post int intermediate_height = h + 7; - if (x_step_q4 != 16 || y_step_q4 != 16) - return vp9_convolve8_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + if (x_step_q4 != 16 || y_step_q4 != 16) { + vp9_convolve8_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + return; + } /* Filter starting 3 lines back. The neon implementation will ignore the * given height and filter a multiple of 4 lines. Since this goes in to @@ -57,12 +59,14 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); int intermediate_height = h + 7; - if (x_step_q4 != 16 || y_step_q4 != 16) - return vp9_convolve8_avg_c(src, src_stride, - dst, dst_stride, - filter_x, x_step_q4, - filter_y, y_step_q4, - w, h); + if (x_step_q4 != 16 || y_step_q4 != 16) { + vp9_convolve8_avg_c(src, src_stride, + dst, dst_stride, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + return; + } /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. 
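The vp9_convolve_neon.c hunk above is a small correctness cleanup: the void function no longer returns the (void) result of its C fallback, it calls the fallback and then returns. The regenerated vp8_rtcd.h further up shows the broader pattern of this header: each kernel is either pinned to one variant with a #define when that variant is always safe for the build, or exposed as an RTCD_EXTERN function pointer that setup_rtcd_internal() rebinds once at start-up from the x86_simd_caps() flags, assigning the portable baseline first and then upgrading in ascending ISA order so the best supported variant wins. The sketch below shows that dispatch shape in isolation; the flag value, the simd_caps() stub, and the sad16x16_* names are stand-ins invented for this example, not libvpx API.

#include <stdio.h>

/* Illustrative flag bit; the real HAS_* masks come from libvpx's x86 headers. */
#define HAS_SSE3 0x01

/* Signature shared by all variants of one kernel, mirroring the SAD
 * declarations in the header above. */
typedef unsigned int (*sad_fn)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int max_sad);

/* Portable baseline, in the spirit of a *_c kernel: sum of absolute
 * differences over a 16x16 block. */
static unsigned int sad16x16_c(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int max_sad) {
  unsigned int sad = 0;
  int r, c;
  (void)max_sad;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) {
      int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
  return sad;
}

/* Stand-in for an SIMD build of the same kernel; a real tree supplies
 * assembly, here it simply forwards to the C version. */
static unsigned int sad16x16_sse3(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int max_sad) {
  return sad16x16_c(src, src_stride, ref, ref_stride, max_sad);
}

/* The RTCD_EXTERN pointer: callers always go through this indirection. */
static sad_fn sad16x16 = sad16x16_c;

/* Stand-in for x86_simd_caps(). */
static int simd_caps(void) { return HAS_SSE3; }

/* Same shape as setup_rtcd_internal(): baseline first, then guarded
 * upgrades in ascending ISA order. */
static void setup_dispatch(void) {
  int flags = simd_caps();
  sad16x16 = sad16x16_c;
  if (flags & HAS_SSE3) sad16x16 = sad16x16_sse3;
}

int main(void) {
  unsigned char a[16 * 16], b[16 * 16];
  int i;
  for (i = 0; i < 16 * 16; ++i) {
    a[i] = (unsigned char)(i % 100);
    b[i] = (unsigned char)(i % 100 + 2);  /* every pixel differs by 2 */
  }
  setup_dispatch();
  printf("SAD = %u\n", sad16x16(a, 16, b, 16, ~0u)); /* prints SAD = 512 */
  return 0;
}

Assigning the baseline unconditionally before the guarded upgrades is what keeps the generated setup function correct even when every feature test fails.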
diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
similarity index 100%
rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm
rename to media/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
similarity index 100%
rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
rename to media/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
similarity index 100%
rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
rename to media/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
similarity index 99%
rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
rename to media/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
index 388a7d719838..72e933eee967 100644
--- a/media/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/media/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
@@ -72,7 +72,7 @@ cospi_31_64 EQU 804
     ; reg1 = output[first_offset]
     ; reg2 = output[second_offset]
     ; for proper address calculation, the last offset used when manipulating
-    ; output, wethere reading or storing) must be passed in. use 0 for first
+    ; output, whether reading or storing) must be passed in. use 0 for first
     ; use.
     MACRO
     LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -88,7 +88,7 @@ cospi_31_64 EQU 804
     ; output[first_offset] = reg1
     ; output[second_offset] = reg2
     ; for proper address calculation, the last offset used when manipulating
-    ; output, wethere reading or storing) must be passed in. use 0 for first
+    ; output, whether reading or storing) must be passed in. use 0 for first
     ; use.
     MACRO
     STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
@@ -242,7 +242,7 @@ cospi_31_64 EQU 804
     ; TODO(cd): have special case to re-use constants when they are similar for
     ; consecutive butterflies
     ; TODO(cd): have special case when both constants are the same, do the
-    ; additions/substractions before the multiplies.
+    ; additions/subtractions before the multiplies.
; generate the constants ; generate scalar constants mov r8, #$first_constant & 0xFF00 @@ -260,7 +260,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regB, d31 vmull.s16 q12, $regC, d31 ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/substractions (to get back two register) + ; do some addition/subtractions (to get back two register) vsub.s32 q8, q8, q10 vsub.s32 q9, q9, q11 ; do more multiplications (ordered for maximum latency hiding) @@ -268,7 +268,7 @@ cospi_31_64 EQU 804 vmull.s16 q11, $regA, d30 vmull.s16 q15, $regB, d30 ; (used) six for intermediate (q8-q12, q15) - ; do more addition/substractions + ; do more addition/subtractions vadd.s32 q11, q12, q11 vadd.s32 q10, q10, q15 ; (used) four for intermediate (q8-q11) diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm similarity index 100% rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm rename to media/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm similarity index 100% rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm rename to media/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm similarity index 100% rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm rename to media/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm similarity index 98% rename from media/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm rename to media/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm index 54764008bdf8..ab5bb69202a4 100644 --- a/media/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/media/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm @@ -9,7 +9,7 @@ ; EXPORT |vp9_idct8x8_64_add_neon| - EXPORT |vp9_idct8x8_10_add_neon| + EXPORT |vp9_idct8x8_12_add_neon| ARM REQUIRE8 PRESERVE8 @@ -310,13 +310,13 @@ bx lr ENDP ; |vp9_idct8x8_64_add_neon| -;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_idct8x8_10_add_neon| PROC +|vp9_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! 
@@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_idct8x8_10_add_neon| + ENDP ; |vp9_idct8x8_12_add_neon| END diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm similarity index 100% rename from media/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm rename to media/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm diff --git a/media/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm similarity index 99% rename from media/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm rename to media/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm index 93d3af3011c2..b41f5661b805 100644 --- a/media/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm +++ b/media/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm @@ -576,6 +576,7 @@ vld1.s16 {q14,q15}, [r0]! push {r0-r10} + vpush {d8-d15} ; transpose the input data TRANSPOSE8X8 @@ -636,6 +637,7 @@ iadst_iadst IADST8X8_1D end_vp9_iht8x8_64_add_neon + vpop {d8-d15} pop {r0-r10} ; ROUND_POWER_OF_TWO(temp_out[j], 5) diff --git a/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm new file mode 100644 index 000000000000..5b8ec20287dc --- /dev/null +++ b/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm @@ -0,0 +1,199 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+;
+
+    EXPORT  |vp9_lpf_horizontal_4_dual_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
+;                                    const uint8_t *blimit0,
+;                                    const uint8_t *limit0,
+;                                    const uint8_t *thresh0,
+;                                    const uint8_t *blimit1,
+;                                    const uint8_t *limit1,
+;                                    const uint8_t *thresh1)
+; r0    uint8_t *s,
+; r1    int p,
+; r2    const uint8_t *blimit0,
+; r3    const uint8_t *limit0,
+; sp    const uint8_t *thresh0,
+; sp+4  const uint8_t *blimit1,
+; sp+8  const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vp9_lpf_horizontal_4_dual_neon| PROC
+    push {lr}
+
+    ldr r12, [sp, #4]              ; load thresh0
+    vld1.8 {d0}, [r2]              ; load blimit0 to first half q
+    vld1.8 {d2}, [r3]              ; load limit0 to first half q
+
+    add r1, r1, r1                 ; double pitch
+    ldr r2, [sp, #8]               ; load blimit1
+
+    vld1.8 {d4}, [r12]             ; load thresh0 to first half q
+
+    ldr r3, [sp, #12]              ; load limit1
+    ldr r12, [sp, #16]             ; load thresh1
+    vld1.8 {d1}, [r2]              ; load blimit1 to 2nd half q
+
+    sub r2, r0, r1, lsl #1         ; s[-4 * p]
+
+    vld1.8 {d3}, [r3]              ; load limit1 to 2nd half q
+    vld1.8 {d5}, [r12]             ; load thresh1 to 2nd half q
+
+    vpush {d8-d15}                 ; save neon registers
+
+    add r3, r2, r1, lsr #1         ; s[-3 * p]
+
+    vld1.u8 {q3}, [r2@64], r1      ; p3
+    vld1.u8 {q4}, [r3@64], r1      ; p2
+    vld1.u8 {q5}, [r2@64], r1      ; p1
+    vld1.u8 {q6}, [r3@64], r1      ; p0
+    vld1.u8 {q7}, [r2@64], r1      ; q0
+    vld1.u8 {q8}, [r3@64], r1      ; q1
+    vld1.u8 {q9}, [r2@64]          ; q2
+    vld1.u8 {q10}, [r3@64]         ; q3
+
+    sub r2, r2, r1, lsl #1
+    sub r3, r3, r1, lsl #1
+
+    bl vp9_loop_filter_neon_16
+
+    vst1.u8 {q5}, [r2@64], r1      ; store op1
+    vst1.u8 {q6}, [r3@64], r1      ; store op0
+    vst1.u8 {q7}, [r2@64], r1      ; store oq0
+    vst1.u8 {q8}, [r3@64], r1      ; store oq1
+
+    vpop {d8-d15}                  ; restore neon registers
+
+    pop {pc}
+    ENDP        ; |vp9_lpf_horizontal_4_dual_neon|
+
+; void vp9_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|vp9_loop_filter_neon_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon_16| + + END diff --git a/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c new file mode 100644 index 000000000000..bc6a17cd16fd --- /dev/null +++ b/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" + +void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +} diff --git a/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm index 8b4fe5dccf6d..4430322171dd 100644 --- a/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm +++ b/media/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -8,10 +8,10 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_loop_filter_horizontal_edge_neon| - EXPORT |vp9_loop_filter_vertical_edge_neon| - EXPORT |vp9_mbloop_filter_horizontal_edge_neon| - EXPORT |vp9_mbloop_filter_vertical_edge_neon| + EXPORT |vp9_lpf_horizontal_4_neon| + EXPORT |vp9_lpf_vertical_4_neon| + EXPORT |vp9_lpf_horizontal_8_neon| + EXPORT |vp9_lpf_vertical_8_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -21,12 +21,12 @@ ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. ; -; void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ @@ -34,7 +34,7 @@ ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_loop_filter_horizontal_edge_neon| PROC +|vp9_lpf_horizontal_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -77,19 +77,19 @@ count_lf_h_loop end_vp9_lf_h_edge pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_neon| + ENDP ; |vp9_lpf_horizontal_4_neon| ; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter ; works on 16 iterations at a time. ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. 
; -; void vp9_loop_filter_vertical_edge_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int p, /* pitch */ @@ -97,7 +97,7 @@ end_vp9_lf_h_edge ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_loop_filter_vertical_edge_neon| PROC +|vp9_lpf_vertical_4_neon| PROC push {lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -158,7 +158,7 @@ count_lf_v_loop end_vp9_lf_v_edge pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_neon| + ENDP ; |vp9_lpf_vertical_4_neon| ; void vp9_loop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the @@ -276,18 +276,18 @@ end_vp9_lf_v_edge bx lr ENDP ; |vp9_loop_filter_neon| -; void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; r0 uint8_t *s, ; r1 int p, /* pitch */ ; r2 const uint8_t *blimit, ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_mbloop_filter_horizontal_edge_neon| PROC +|vp9_lpf_horizontal_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -333,14 +333,14 @@ count_mblf_h_loop end_vp9_mblf_h_edge pop {r4-r5, pc} - ENDP ; |vp9_mbloop_filter_horizontal_edge_neon| + ENDP ; |vp9_lpf_horizontal_8_neon| -; void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, -; int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) +; void vp9_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) ; ; r0 uint8_t *s, ; r1 int pitch, @@ -348,7 +348,7 @@ end_vp9_mblf_h_edge ; r3 const uint8_t *limit, ; sp const uint8_t *thresh, ; sp+4 int count -|vp9_mbloop_filter_vertical_edge_neon| PROC +|vp9_lpf_vertical_8_neon| PROC push {r4-r5, lr} vld1.8 {d0[]}, [r2] ; duplicate *blimit @@ -420,7 +420,7 @@ count_mblf_v_loop end_vp9_mblf_v_edge pop {r4-r5, pc} - ENDP ; |vp9_mbloop_filter_vertical_edge_neon| + ENDP ; |vp9_lpf_vertical_8_neon| ; void vp9_mbloop_filter_neon(); ; This is a helper function for the loopfilters. The invidual functions do the diff --git a/media/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm index 2e8001b918b0..5fe2bba4644f 100644 --- a/media/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm +++ b/media/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -8,23 +8,23 @@ ; be found in the AUTHORS file in the root of the source tree. 
 ;

-    EXPORT  |vp9_mb_lpf_horizontal_edge_w_neon|
-    EXPORT  |vp9_mb_lpf_vertical_edge_w_neon|
+    EXPORT  |vp9_lpf_horizontal_16_neon|
+    EXPORT  |vp9_lpf_vertical_16_neon|
     ARM

     AREA ||.text||, CODE, READONLY, ALIGN=2

-; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
-;                                        const uint8_t *blimit,
-;                                        const uint8_t *limit,
-;                                        const uint8_t *thresh
-;                                        int count)
+; void vp9_lpf_horizontal_16_neon(uint8_t *s, int p,
+;                                 const uint8_t *blimit,
+;                                 const uint8_t *limit,
+;                                 const uint8_t *thresh
+;                                 int count)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_mb_lpf_horizontal_edge_w_neon| PROC
+|vp9_lpf_horizontal_16_neon| PROC
     push {r4-r8, lr}
     vpush {d8-d15}
     ldr r4, [sp, #88]              ; load thresh
@@ -115,18 +115,18 @@ h_next
     vpop {d8-d15}
     pop {r4-r8, pc}

-    ENDP        ; |vp9_mb_lpf_horizontal_edge_w_neon|
+    ENDP        ; |vp9_lpf_horizontal_16_neon|

-; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
-;                                      const uint8_t *blimit,
-;                                      const uint8_t *limit,
-;                                      const uint8_t *thresh)
+; void vp9_lpf_vertical_16_neon(uint8_t *s, int p,
+;                               const uint8_t *blimit,
+;                               const uint8_t *limit,
+;                               const uint8_t *thresh)
 ; r0    uint8_t *s,
 ; r1    int p, /* pitch */
 ; r2    const uint8_t *blimit,
 ; r3    const uint8_t *limit,
 ; sp    const uint8_t *thresh,
-|vp9_mb_lpf_vertical_edge_w_neon| PROC
+|vp9_lpf_vertical_16_neon| PROC
     push {r4-r8, lr}
     vpush {d8-d15}
     ldr r4, [sp, #88]              ; load thresh
@@ -279,7 +279,7 @@ v_end
     vpop {d8-d15}
     pop {r4-r8, pc}

-    ENDP        ; |vp9_mb_lpf_vertical_edge_w_neon|
+    ENDP        ; |vp9_lpf_vertical_16_neon|

 ; void vp9_wide_mbfilter_neon();
 ; This is a helper function for the loopfilters. The individual functions do the
@@ -439,6 +439,9 @@ v_end
     tst r7, #1
     bxne lr

+    orrs r5, r5, r6                ; Check for 0
+    orreq r7, r7, #2               ; Only do mbfilter branch
+
     ; mbfilter flat && mask branch
     ; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's
     ; and using vbit on the q's?
diff --git a/media/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/media/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
new file mode 100644
index 000000000000..dc9856fa8872
--- /dev/null
+++ b/media/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
@@ -0,0 +1,634 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+; + + EXPORT |vp9_v_predictor_4x4_neon| + EXPORT |vp9_v_predictor_8x8_neon| + EXPORT |vp9_v_predictor_16x16_neon| + EXPORT |vp9_v_predictor_32x32_neon| + EXPORT |vp9_h_predictor_4x4_neon| + EXPORT |vp9_h_predictor_8x8_neon| + EXPORT |vp9_h_predictor_16x16_neon| + EXPORT |vp9_h_predictor_32x32_neon| + EXPORT |vp9_tm_predictor_4x4_neon| + EXPORT |vp9_tm_predictor_8x8_neon| + EXPORT |vp9_tm_predictor_16x16_neon| + EXPORT |vp9_tm_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_4x4_neon| + +;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_8x8_neon| + +;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_v_predictor_16x16_neon| + +;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |vp9_v_predictor_32x32_neon| + +;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_4x4_neon| + +;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t 
y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_8x8_neon| + +;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |vp9_h_predictor_16x16_neon| + +;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! 
+ vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |vp9_h_predictor_32x32_neon| + +;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |vp9_tm_predictor_4x4_neon| + +;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |vp9_tm_predictor_8x8_neon| + +;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 q0, r12 + + ; Load above 8 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left into r12 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d1 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. 
+ vdup.16 q0, d20[0]
+ vdup.16 q8, d20[1]
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d20[2] ; preload next 2 rows' data
+ vdup.16 q8, d20[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[0] ; preload next 2 rows' data
+ vdup.16 q8, d21[1]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vdup.16 q0, d21[2] ; preload next 2 rows' data
+ vdup.16 q8, d21[3]
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q1, q0, q2
+ vadd.s16 q0, q0, q3
+ vadd.s16 q11, q8, q2
+ vadd.s16 q8, q8, q3
+ vqmovun.s16 d2, q1
+ vqmovun.s16 d3, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q8
+ vld1.8 {d18}, [r3]! ; preload next 8 left pixels into d18
+ vmovl.u8 q10, d18
+ vst1.64 {d2,d3}, [r0], r1
+ vst1.64 {d22,d23}, [r0], r1
+
+ subs r2, r2, #1
+ bgt loop_16x16_neon
+
+ bx lr
+ ENDP ; |vp9_tm_predictor_16x16_neon|
+
+;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
+; const uint8_t *above,
+; const uint8_t *left)
+; r0 uint8_t *dst
+; r1 ptrdiff_t y_stride
+; r2 const uint8_t *above
+; r3 const uint8_t *left
+
+|vp9_tm_predictor_32x32_neon| PROC
+ ; Load ytop_left = above[-1];
+ sub r12, r2, #1
+ ldrb r12, [r12]
+ vdup.u8 q0, r12
+
+ ; Load above 32 pixels
+ vld1.8 {q1}, [r2]!
+ vld1.8 {q2}, [r2]
+
+ ; preload 8 left pixels
+ vld1.8 {d26}, [r3]!
+
+ ; Compute above - ytop_left
+ vsubl.u8 q8, d2, d0
+ vsubl.u8 q9, d3, d1
+ vsubl.u8 q10, d4, d0
+ vsubl.u8 q11, d5, d1
+
+ vmovl.u8 q3, d26
+
+ ; Load left row by row and compute left + (above - ytop_left)
+ ; Process 8 rows per loop iteration, looping 4 times to cover 32 rows.
+ mov r2, #4
+
+loop_32x32_neon
+ ; Process two rows.
+ vdup.16 q0, d6[0]
+ vdup.16 q2, d6[1]
+ vadd.s16 q12, q0, q8
+ vadd.s16 q13, q0, q9
+ vadd.s16 q14, q0, q10
+ vadd.s16 q15, q0, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q1, d6[2]
+ vdup.16 q2, d6[3]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q1, q8
+ vadd.s16 q13, q1, q9
+ vadd.s16 q14, q1, q10
+ vadd.s16 q15, q1, q11
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vadd.s16 q12, q2, q8
+ vadd.s16 q13, q2, q9
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ vadd.s16 q14, q2, q10
+ vadd.s16 q15, q2, q11
+ vst1.64 {d0-d3}, [r0], r1
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vqmovun.s16 d26, q14
+ vqmovun.s16 d27, q15
+ vdup.16 q0, d7[0]
+ vdup.16 q2, d7[1]
+ vst1.64 {d24-d27}, [r0], r1
+
+ ; Process two rows.
+ vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |vp9_tm_predictor_32x32_neon| + + END diff --git a/media/libvpx/vp9/common/generic/vp9_systemdependent.c b/media/libvpx/vp9/common/generic/vp9_systemdependent.c deleted file mode 100644 index 536febb65223..000000000000 --- a/media/libvpx/vp9/common/generic/vp9_systemdependent.c +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_onyxc_int.h" - -void vp9_machine_specific_config(VP9_COMMON *cm) { - (void)cm; - vp9_rtcd(); -} diff --git a/media/libvpx/vp9/common/vp9_alloccommon.c b/media/libvpx/vp9/common/vp9_alloccommon.c index d2981601b8e9..21ae8d575ad2 100644 --- a/media/libvpx/vp9/common/vp9_alloccommon.c +++ b/media/libvpx/vp9/common/vp9_alloccommon.c @@ -8,54 +8,33 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" -#include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_systemdependent.h" -void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { - const int stride = cm->mode_info_stride; +static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) { int i; - // Clear down top border row - vpx_memset(mi, 0, sizeof(MODE_INFO) * stride); + // Top border row + vpx_memset(mi, 0, sizeof(*mi) * cm->mi_stride); - // Clear left border column - for (i = 1; i < cm->mi_rows + 1; i++) - vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO)); + // Left border column + for (i = 1; i < cm->mi_rows + 1; ++i) + vpx_memset(&mi[i * cm->mi_stride], 0, sizeof(*mi)); } -void vp9_free_frame_buffers(VP9_COMMON *cm) { - int i; +void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { + const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); - for (i = 0; i < NUM_YV12_BUFFERS; i++) - vp9_free_frame_buffer(&cm->yv12_fb[i]); - - vp9_free_frame_buffer(&cm->post_proc_buffer); - - vpx_free(cm->mip); - vpx_free(cm->prev_mip); - vpx_free(cm->last_frame_seg_map); - vpx_free(cm->mi_grid_base); - vpx_free(cm->prev_mi_grid_base); - - cm->mip = NULL; - cm->prev_mip = NULL; - cm->last_frame_seg_map = NULL; - cm->mi_grid_base = NULL; - cm->prev_mi_grid_base = NULL; -} - -static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { cm->mi_cols = aligned_width >> MI_SIZE_LOG2; cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mode_info_stride = cm->mi_cols + MI_BLOCK_SIZE; + cm->mi_stride = calc_mi_size(cm->mi_cols); cm->mb_cols = (cm->mi_cols + 1) >> 1; cm->mb_rows = (cm->mi_rows + 1) >> 1; @@ -63,167 +42,194 @@ static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { } static void setup_mi(VP9_COMMON *cm) { - cm->mi = cm->mip + cm->mode_info_stride + 1; - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; - cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; + cm->mi = cm->mip + cm->mi_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; - vpx_memset(cm->mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); - vpx_memset(cm->mi_grid_base, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * - sizeof(*cm->mi_grid_base)); + vpx_memset(cm->mi_grid_base, 0, cm->mi_stride * (cm->mi_rows + 1) * + sizeof(*cm->mi_grid_base)); - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_border(cm, cm->prev_mip); + clear_mi_border(cm, cm->prev_mip); } -int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { - const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); - const int ss_x = cm->subsampling_x; - const int ss_y = cm->subsampling_y; - int mi_size; +static int alloc_mi(VP9_COMMON *cm, int mi_size) { + int i; - if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) - goto fail; + for (i = 0; i < 2; ++i) { + 
cm->mip_array[i] = + (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO)); + if (cm->mip_array[i] == NULL) + return 1; - set_mb_mi(cm, aligned_width, aligned_height); + cm->mi_grid_base_array[i] = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*)); + if (cm->mi_grid_base_array[i] == NULL) + return 1; + } - // Allocation - mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); + cm->mi_alloc_size = mi_size; - vpx_free(cm->mip); - cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->mip) - goto fail; + // Init the index. + cm->mi_idx = 0; + cm->prev_mi_idx = 1; - vpx_free(cm->prev_mip); - cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->prev_mip) - goto fail; + cm->mip = cm->mip_array[cm->mi_idx]; + cm->prev_mip = cm->mip_array[cm->prev_mi_idx]; + cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx]; + cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx]; - vpx_free(cm->mi_grid_base); - cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (!cm->mi_grid_base) - goto fail; + return 0; +} - vpx_free(cm->prev_mi_grid_base); - cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); - if (!cm->prev_mi_grid_base) - goto fail; +static void free_mi(VP9_COMMON *cm) { + int i; - setup_mi(cm); + for (i = 0; i < 2; ++i) { + vpx_free(cm->mip_array[i]); + cm->mip_array[i] = NULL; + vpx_free(cm->mi_grid_base_array[i]); + cm->mi_grid_base_array[i] = NULL; + } + + cm->mip = NULL; + cm->prev_mip = NULL; + cm->mi_grid_base = NULL; + cm->prev_mi_grid_base = NULL; +} + +void vp9_free_ref_frame_buffers(VP9_COMMON *cm) { + int i; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + vp9_free_frame_buffer(&cm->frame_bufs[i].buf); + + if (cm->frame_bufs[i].ref_count > 0 && + cm->frame_bufs[i].raw_frame_buffer.data != NULL) { + cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer); + cm->frame_bufs[i].ref_count = 0; + } + } + + vp9_free_frame_buffer(&cm->post_proc_buffer); +} + +void vp9_free_context_buffers(VP9_COMMON *cm) { + free_mi(cm); - // Create the segmentation map structure and set to 0. 
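The two-element mip_array/mi_grid_base_array introduced in alloc_mi() is a double-buffering scheme: the decoder needs last frame's mode info while it fills in the current frame's, so it keeps both allocations alive and flips between them once per frame (see vp9_swap_mi_and_prev_mi() later in this file) instead of freeing and reallocating. A minimal sketch of the pattern, using a stand-in struct (Ctx is hypothetical, not a type from the patch):

typedef struct {
  int mi_idx, prev_mi_idx;  /* which element is "current" vs "previous" */
  void *mip_array[2];
  void *mip, *prev_mip;     /* cached pointers into mip_array */
} Ctx;

/* O(1) per-frame flip: swap the indices and refresh the cached
   pointers; no mode info is ever copied */
static void swap_mi_sketch(Ctx *c) {
  const int tmp = c->mi_idx;
  c->mi_idx = c->prev_mi_idx;
  c->prev_mi_idx = tmp;
  c->mip = c->mip_array[c->mi_idx];
  c->prev_mip = c->mip_array[c->prev_mi_idx];
}
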
vpx_free(cm->last_frame_seg_map); - cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); - if (!cm->last_frame_seg_map) + cm->last_frame_seg_map = NULL; + + vpx_free(cm->above_context); + cm->above_context = NULL; + + vpx_free(cm->above_seg_context); + cm->above_seg_context = NULL; +} + +int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { + vp9_free_context_buffers(cm); + + vp9_set_mb_mi(cm, width, height); + if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows))) goto fail; + cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); + if (!cm->last_frame_seg_map) goto fail; + + cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc( + 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE, + sizeof(*cm->above_context)); + if (!cm->above_context) goto fail; + + cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc( + mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context)); + if (!cm->above_seg_context) goto fail; + return 0; fail: - vp9_free_frame_buffers(cm); + vp9_free_context_buffers(cm); return 1; } -int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { +static void init_frame_bufs(VP9_COMMON *cm) { int i; - const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); + cm->new_fb_idx = FRAME_BUFFERS - 1; + cm->frame_bufs[cm->new_fb_idx].ref_count = 1; + + for (i = 0; i < REF_FRAMES; ++i) { + cm->ref_frame_map[i] = i; + cm->frame_bufs[i].ref_count = 1; + } +} + +int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) { + int i; const int ss_x = cm->subsampling_x; const int ss_y = cm->subsampling_y; - int mi_size; - vp9_free_frame_buffers(cm); + vp9_free_ref_frame_buffers(cm); - for (i = 0; i < NUM_YV12_BUFFERS; i++) { - cm->fb_idx_ref_cnt[i] = 0; - if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + for (i = 0; i < FRAME_BUFFERS; ++i) { + cm->frame_bufs[i].ref_count = 0; + if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height, + ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; } - cm->new_fb_idx = NUM_YV12_BUFFERS - 1; - cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; - - for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++) - cm->active_ref_idx[i] = i; - - for (i = 0; i < NUM_REF_FRAMES; i++) { - cm->ref_frame_map[i] = i; - cm->fb_idx_ref_cnt[i] = 1; - } + init_frame_bufs(cm); +#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) - goto fail; - - set_mb_mi(cm, aligned_width, aligned_height); - - // Allocation - mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE); - - cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->mip) - goto fail; - - cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO)); - if (!cm->prev_mip) - goto fail; - - cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); - if (!cm->mi_grid_base) - goto fail; - - cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); - if (!cm->prev_mi_grid_base) - goto fail; - - setup_mi(cm); - - // Create the segmentation map structure and set to 0. 
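To put numbers on the sizing in vp9_alloc_context_buffers(): with MI_SIZE_LOG2 == 3 (one mode-info unit per 8x8 pixels) and MI_BLOCK_SIZE == 8, and assuming calc_mi_size() pads one superblock border as in the libvpx headers of this vintage (len + MI_BLOCK_SIZE), a 1080p frame allocates as shown below. The standalone program is illustrative only:

#include <stdio.h>

#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE 8
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

/* assumed definition: pads one 8-MI (64-pixel) superblock border */
static int calc_mi_size(int len) { return len + MI_BLOCK_SIZE; }

int main(void) {
  const int width = 1920, height = 1080;
  const int mi_cols = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
  const int mi_rows = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
  const int mi_stride = calc_mi_size(mi_cols);
  printf("mi_cols=%d mi_rows=%d mi_stride=%d alloc=%d MODE_INFOs\n",
         mi_cols, mi_rows, mi_stride, mi_stride * calc_mi_size(mi_rows));
  /* prints: mi_cols=240 mi_rows=135 mi_stride=248 alloc=35464 MODE_INFOs */
  return 0;
}
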
- cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1); - if (!cm->last_frame_seg_map) +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; +#endif return 0; fail: - vp9_free_frame_buffers(cm); + vp9_free_ref_frame_buffers(cm); return 1; } -void vp9_create_common(VP9_COMMON *cm) { - vp9_machine_specific_config(cm); - - cm->tx_mode = ONLY_4X4; - cm->comp_pred_mode = HYBRID_PREDICTION; -} - void vp9_remove_common(VP9_COMMON *cm) { - vp9_free_frame_buffers(cm); + vp9_free_ref_frame_buffers(cm); + vp9_free_context_buffers(cm); + vp9_free_internal_frame_buffers(&cm->int_frame_buffers); } -void vp9_initialize_common() { - vp9_init_neighbors(); - vp9_coef_tree_initialize(); - vp9_entropy_mode_init(); - vp9_entropy_mv_init(); -} - -void vp9_update_frame_size(VP9_COMMON *cm) { - const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2); - const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2); - - set_mb_mi(cm, aligned_width, aligned_height); +void vp9_init_context_buffers(VP9_COMMON *cm) { setup_mi(cm); - - // Initialize the previous frame segment map to 0. if (cm->last_frame_seg_map) vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } + +void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { + // Swap indices. + const int tmp = cm->mi_idx; + cm->mi_idx = cm->prev_mi_idx; + cm->prev_mi_idx = tmp; + + // Current mip will be the prev_mip for the next frame. + cm->mip = cm->mip_array[cm->mi_idx]; + cm->prev_mip = cm->mip_array[cm->prev_mi_idx]; + cm->mi_grid_base = cm->mi_grid_base_array[cm->mi_idx]; + cm->prev_mi_grid_base = cm->mi_grid_base_array[cm->prev_mi_idx]; + + // Update the upper left visible macroblock ptrs. + cm->mi = cm->mip + cm->mi_stride + 1; + cm->prev_mi = cm->prev_mip + cm->mi_stride + 1; + cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1; + cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1; +} diff --git a/media/libvpx/vp9/common/vp9_alloccommon.h b/media/libvpx/vp9/common/vp9_alloccommon.h index cf8dca573200..c5b893facca3 100644 --- a/media/libvpx/vp9/common/vp9_alloccommon.h +++ b/media/libvpx/vp9/common/vp9_alloccommon.h @@ -12,20 +12,29 @@ #ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ #define VP9_COMMON_VP9_ALLOCCOMMON_H_ -#include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif -void vp9_initialize_common(); +struct VP9Common; -void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi); +void vp9_remove_common(struct VP9Common *cm); -void vp9_create_common(VP9_COMMON *cm); -void vp9_remove_common(VP9_COMMON *cm); +int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height); +void vp9_init_context_buffers(struct VP9Common *cm); +void vp9_free_context_buffers(struct VP9Common *cm); -int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height); -int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height); -void vp9_free_frame_buffers(VP9_COMMON *cm); +int vp9_alloc_ref_frame_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_ref_frame_buffers(struct VP9Common *cm); +int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height); +void vp9_free_state_buffers(struct VP9Common *cm); -void vp9_update_frame_size(VP9_COMMON *cm); +void vp9_set_mb_mi(struct VP9Common *cm, int width, int height); +void vp9_swap_mi_and_prev_mi(struct VP9Common *cm); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/media/libvpx/vp9/common/vp9_blockd.c 
b/media/libvpx/vp9/common/vp9_blockd.c new file mode 100644 index 000000000000..dab8f96172d0 --- /dev/null +++ b/media/libvpx/vp9/common/vp9_blockd.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_blockd.h" + +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b) { + if (b == 0 || b == 2) { + if (!left_mi || is_inter_block(&left_mi->mbmi)) + return DC_PRED; + + return get_y_mode(left_mi, b + 1); + } else { + assert(b == 1 || b == 3); + return cur_mi->bmi[b - 1].as_mode; + } +} + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b) { + if (b == 0 || b == 1) { + if (!above_mi || is_inter_block(&above_mi->mbmi)) + return DC_PRED; + + return get_y_mode(above_mi, b + 2); + } else { + assert(b == 2 || b == 3); + return cur_mi->bmi[b - 2].as_mode; + } +} + +void vp9_foreach_transformed_block_in_plane( + const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) + : mbmi->tx_size; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int step = 1 << (tx_size << 1); + int i; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + int r, c; + + int max_blocks_wide = num_4x4_w; + int max_blocks_high = num_4x4_h; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. + if (xd->mb_to_right_edge < 0) + max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + + if (xd->mb_to_bottom_edge < 0) + max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + i = 0; + // Unlike the normal case - in here we have to keep track of the + // row and column of the blocks we use so that we know if we are in + // the unrestricted motion border. 
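The shift by (5 + pd->subsampling_x) in the edge handling above folds three conversions into one: mb_to_right_edge is stored in units of pixels * 8, so >> 3 gives pixels, a further >> 2 gives 4x4-block columns, and the plane's subsampling adds one more bit. A standalone worked example (the values are illustrative; like the library code, it relies on arithmetic right shift of negative ints):

#include <stdio.h>

int main(void) {
  /* a block overhanging the right frame edge by 16 luma pixels */
  const int mb_to_right_edge = -16 * 8;  /* stored in 1/8-pel units */
  const int ss_x_y = 0;                  /* luma plane */
  const int ss_x_uv = 1;                 /* 4:2:0 chroma plane */
  printf("luma:   %d 4x4 columns\n", mb_to_right_edge >> (5 + ss_x_y));
  printf("chroma: %d 4x4 columns\n", mb_to_right_edge >> (5 + ss_x_uv));
  /* prints -4 and -2: that many 4x4 columns fall outside the frame */
  return 0;
}
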
+ for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { + for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { + if (r < max_blocks_high && c < max_blocks_wide) + visit(plane, i, plane_bsize, tx_size, arg); + i += step; + } + } + } else { + for (i = 0; i < num_4x4_w * num_4x4_h; i += step) + visit(plane, i, plane_bsize, tx_size, arg); + } +} + +void vp9_foreach_transformed_block(const MACROBLOCKD* const xd, + BLOCK_SIZE bsize, + foreach_transformed_block_visitor visit, + void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; plane++) + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); +} + +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_context + loff; + const int tx_size_in_blocks = 1 << tx_size; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + int i; + const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + + (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + int above_contexts = tx_size_in_blocks; + if (above_contexts + aoff > blocks_wide) + above_contexts = blocks_wide - aoff; + + for (i = 0; i < above_contexts; ++i) + a[i] = has_eob; + for (i = above_contexts; i < tx_size_in_blocks; ++i) + a[i] = 0; + } else { + vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + int i; + const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + int left_contexts = tx_size_in_blocks; + if (left_contexts + loff > blocks_high) + left_contexts = blocks_high - loff; + + for (i = 0; i < left_contexts; ++i) + l[i] = has_eob; + for (i = left_contexts; i < tx_size_in_blocks; ++i) + l[i] = 0; + } else { + vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y; + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? 
ss_y : 0; + } +} diff --git a/media/libvpx/vp9/common/vp9_blockd.h b/media/libvpx/vp9/common/vp9_blockd.h index c5da3758a1c6..951e6e02334b 100644 --- a/media/libvpx/vp9/common/vp9_blockd.h +++ b/media/libvpx/vp9/common/vp9_blockd.h @@ -24,10 +24,14 @@ #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_treecoder.h" -#define BLOCK_SIZE_GROUPS 4 -#define MBSKIP_CONTEXTS 3 +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 +#define SKIP_CONTEXTS 3 +#define INTER_MODE_CONTEXTS 7 /* Segment Feature Masks */ #define MAX_MV_REF_CANDIDATES 2 @@ -37,8 +41,9 @@ #define REF_CONTEXTS 5 typedef enum { - PLANE_TYPE_Y_WITH_DC, - PLANE_TYPE_UV, + PLANE_TYPE_Y = 0, + PLANE_TYPE_UV = 1, + PLANE_TYPES } PLANE_TYPE; typedef char ENTROPY_CONTEXT; @@ -72,9 +77,9 @@ typedef enum { ZEROMV, NEWMV, MB_MODE_COUNT -} MB_PREDICTION_MODE; +} PREDICTION_MODE; -static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { +static INLINE int is_inter_mode(PREDICTION_MODE mode) { return mode >= NEARESTMV && mode <= NEWMV; } @@ -84,13 +89,12 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { #define INTER_OFFSET(mode) ((mode) - NEARESTMV) - /* For keyframes, intra block modes are predicted by the (already decoded) modes for the Y blocks to the left and above us; for interframes, there is a single probability table. */ typedef struct { - MB_PREDICTION_MODE as_mode; + PREDICTION_MODE as_mode; int_mv as_mv[2]; // first, second inter predictor motion vectors } b_mode_info; @@ -114,30 +118,25 @@ static INLINE int mi_width_log2(BLOCK_SIZE sb_type) { return mi_width_log2_lookup[sb_type]; } -static INLINE int mi_height_log2(BLOCK_SIZE sb_type) { - return mi_height_log2_lookup[sb_type]; -} - // This structure now relates to 8x8 block regions. typedef struct { - MB_PREDICTION_MODE mode, uv_mode; - MV_REFERENCE_FRAME ref_frame[2]; - TX_SIZE tx_size; - int_mv mv[2]; // for each reference frame used - int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv[2]; - - uint8_t mode_context[MAX_REF_FRAMES]; - - unsigned char skip_coeff; // 0=need to decode coeffs, 1=no coefficients - unsigned char segment_id; // Segment id for this block. - - // Flags used for prediction status of various bit-stream signals - unsigned char seg_id_predicted; - - INTERPOLATION_TYPE interp_filter; - + // Common for both INTER and INTRA blocks BLOCK_SIZE sb_type; + PREDICTION_MODE mode; + TX_SIZE tx_size; + int8_t skip; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled + + // Only for INTRA blocks + PREDICTION_MODE uv_mode; + + // Only for INTER blocks + MV_REFERENCE_FRAME ref_frame[2]; + int_mv mv[2]; + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; + uint8_t mode_context[MAX_REF_FRAMES]; + INTERP_FILTER interp_filter; } MB_MODE_INFO; typedef struct { @@ -145,6 +144,11 @@ typedef struct { b_mode_info bmi[4]; } MODE_INFO; +static INLINE PREDICTION_MODE get_y_mode(const MODE_INFO *mi, int block) { + return mi->mbmi.sb_type < BLOCK_8X8 ? 
mi->bmi[block].as_mode + : mi->mbmi.mode; +} + static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[0] > INTRA_FRAME; } @@ -153,16 +157,18 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { return mbmi->ref_frame[1] > INTRA_FRAME; } +PREDICTION_MODE vp9_left_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *left_mi, int b); + +PREDICTION_MODE vp9_above_block_mode(const MODE_INFO *cur_mi, + const MODE_INFO *above_mi, int b); + enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; -#if CONFIG_ALPHA -enum { MAX_MB_PLANE = 4 }; -#else enum { MAX_MB_PLANE = 3 }; -#endif struct buf_2d { uint8_t *buf; @@ -170,33 +176,34 @@ struct buf_2d { }; struct macroblockd_plane { - int16_t *qcoeff; int16_t *dqcoeff; - uint16_t *eobs; PLANE_TYPE plane_type; int subsampling_x; int subsampling_y; struct buf_2d dst; struct buf_2d pre[2]; - int16_t *dequant; + const int16_t *dequant; ENTROPY_CONTEXT *above_context; ENTROPY_CONTEXT *left_context; }; #define BLOCK_OFFSET(x, i) ((x) + (i) * 16) +typedef struct RefBuffer { + // TODO(dkovalev): idx is not really required and should be removed, now it + // is used in vp9_onyxd_if.c + int idx; + YV12_BUFFER_CONFIG *buf; + struct scale_factors sf; +} RefBuffer; + typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; - struct scale_factors scale_factor[2]; - - MODE_INFO *last_mi; - int mode_info_stride; + int mi_stride; // A NULL indicates that the 8x8 is not part of the image - MODE_INFO **mi_8x8; - MODE_INFO **prev_mi_8x8; - MODE_INFO *mi_stream; + MODE_INFO **mi; int up_available; int left_available; @@ -207,15 +214,21 @@ typedef struct macroblockd { int mb_to_top_edge; int mb_to_bottom_edge; - int lossless; - /* Inverse transform function pointers. */ - void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); + /* pointers to reference frames */ + RefBuffer *block_refs[2]; - struct subpix_fn_table subpix; + /* pointer to current frame */ + const YV12_BUFFER_CONFIG *cur_buf; + + /* mc buffer */ + DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + + int lossless; int corrupted; - /* Y,U,V,(A) */ + DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; @@ -223,77 +236,53 @@ typedef struct macroblockd { PARTITION_CONTEXT left_seg_context[8]; } MACROBLOCKD; - - -static BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, PARTITION_TYPE partition) { - const BLOCK_SIZE subsize = subsize_lookup[partition][bsize]; - assert(subsize < BLOCK_SIZES); - return subsize; +static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize, + PARTITION_TYPE partition) { + return subsize_lookup[partition][bsize]; } -extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT]; +extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES]; + +static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + + if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi)) + return DCT_DCT; + return intra_mode_to_tx_type_lookup[mbmi->mode]; +} static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, const MACROBLOCKD *xd, int ib) { - const MODE_INFO *const mi = xd->mi_8x8[0]; - const MB_MODE_INFO *const mbmi = &mi->mbmi; + const MODE_INFO *const mi = xd->mi[0]; - if (plane_type != PLANE_TYPE_Y_WITH_DC || - xd->lossless || - is_inter_block(mbmi)) + if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi)) return DCT_DCT; - return 
mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? - mi->bmi[ib].as_mode : mbmi->mode]; + return intra_mode_to_tx_type_lookup[get_y_mode(mi, ib)]; } -static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type, - const MACROBLOCKD *xd) { - return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; -} +void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y); -static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type, - const MACROBLOCKD *xd) { - return plane_type == PLANE_TYPE_Y_WITH_DC ? - mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT; -} - -static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) { - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; - xd->plane[i].subsampling_x = i ? ss_x : 0; - xd->plane[i].subsampling_y = i ? ss_y : 0; +static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize, + int xss, int yss) { + if (bsize < BLOCK_8X8) { + return TX_4X4; + } else { + const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss]; + return MIN(y_tx_size, max_txsize_lookup[plane_bsize]); } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - xd->plane[3].subsampling_x = 0; - xd->plane[3].subsampling_y = 0; -#endif } - -static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) { - return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]); +static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi, + const struct macroblockd_plane *pd) { + return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); } -static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, - const struct macroblockd_plane *pd) { - BLOCK_SIZE bs = ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; - assert(bs < BLOCK_SIZES); - return bs; -} - -static INLINE int plane_block_width(BLOCK_SIZE bsize, - const struct macroblockd_plane* plane) { - return 4 << (b_width_log2(bsize) - plane->subsampling_x); -} - -static INLINE int plane_block_height(BLOCK_SIZE bsize, - const struct macroblockd_plane* plane) { - return 4 << (b_height_log2(bsize) - plane->subsampling_y); +static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd) { + return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y]; } typedef void (*foreach_transformed_block_visitor)(int plane, int block, @@ -301,106 +290,18 @@ typedef void (*foreach_transformed_block_visitor)(int plane, int block, TX_SIZE tx_size, void *arg); -static INLINE void foreach_transformed_block_in_plane( +void vp9_foreach_transformed_block_in_plane( const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane, - foreach_transformed_block_visitor visit, void *arg) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const MB_MODE_INFO* mbmi = &xd->mi_8x8[0]->mbmi; - // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") - // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 - // transform size varies per plane, look it up in a common way. - const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) - : mbmi->tx_size; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int step = 1 << (tx_size << 1); - int i; + foreach_transformed_block_visitor visit, void *arg); - // If mb_to_right_edge is < 0 we are in a situation in which - // the current block size extends into the UMV and we won't - // visit the sub blocks that are wholly within the UMV. - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - int r, c; - int max_blocks_wide = num_4x4_w; - int max_blocks_high = num_4x4_h; - - // xd->mb_to_right_edge is in units of pixels * 8. This converts - // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) - max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - - if (xd->mb_to_bottom_edge < 0) - max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - - i = 0; - // Unlike the normal case - in here we have to keep track of the - // row and column of the blocks we use so that we know if we are in - // the unrestricted motion border. - for (r = 0; r < num_4x4_h; r += (1 << tx_size)) { - for (c = 0; c < num_4x4_w; c += (1 << tx_size)) { - if (r < max_blocks_high && c < max_blocks_wide) - visit(plane, i, plane_bsize, tx_size, arg); - i += step; - } - } - } else { - for (i = 0; i < num_4x4_w * num_4x4_h; i += step) - visit(plane, i, plane_bsize, tx_size, arg); - } -} - -static INLINE void foreach_transformed_block( +void vp9_foreach_transformed_block( const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - int plane; + foreach_transformed_block_visitor visit, void *arg); - for (plane = 0; plane < MAX_MB_PLANE; plane++) - foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - -static INLINE void foreach_transformed_block_uv( - const MACROBLOCKD* const xd, BLOCK_SIZE bsize, - foreach_transformed_block_visitor visit, void *arg) { - int plane; - - for (plane = 1; plane < MAX_MB_PLANE; plane++) - foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg); -} - -static int raster_block_offset(BLOCK_SIZE plane_bsize, - int raster_block, int stride) { - const int bw = b_width_log2(plane_bsize); - const int y = 4 * (raster_block >> bw); - const int x = 4 * (raster_block & ((1 << bw) - 1)); - return y * stride + x; -} -static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, - int raster_block, int16_t *base) { - const int stride = 4 << b_width_log2(plane_bsize); - return base + raster_block_offset(plane_bsize, raster_block, stride); -} -static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize, - int raster_block, uint8_t *base, - int stride) { - return base + raster_block_offset(plane_bsize, raster_block, stride); -} - -static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int block) { - const int bwl = b_width_log2(plane_bsize); - const int tx_cols_log2 = bwl - tx_size; - const int tx_cols = 1 << tx_cols_log2; - const int raster_mb = block >> (tx_size << 1); - const int x = (raster_mb & (tx_cols - 1)) << tx_size; - const int y = (raster_mb >> tx_cols_log2) << tx_size; - return x + (y << bwl); -} - -static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, int block, - int *x, int *y) { +static INLINE void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int block, + int *x, int *y) { const int bwl = b_width_log2(plane_bsize); const int 
tx_cols_log2 = bwl - tx_size; const int tx_cols = 1 << tx_cols_log2; @@ -409,93 +310,12 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, *y = (raster_mb >> tx_cols_log2) << tx_size; } -static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize, - int plane, int block, TX_SIZE tx_size) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - uint8_t *const buf = pd->dst.buf; - const int stride = pd->dst.stride; +void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, + int aoff, int loff); - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - x = x * 4 - 1; - y = y * 4 - 1; - // Copy a pixel into the umv if we are in a situation where the block size - // extends into the UMV. - // TODO(JBB): Should be able to do the full extend in place so we don't have - // to do this multiple times. - if (xd->mb_to_right_edge < 0) { - const int bw = 4 << b_width_log2(plane_bsize); - const int umv_border_start = bw + (xd->mb_to_right_edge >> - (3 + pd->subsampling_x)); - - if (x + bw > umv_border_start) - vpx_memset(&buf[y * stride + umv_border_start], - buf[y * stride + umv_border_start - 1], bw); - } - - if (xd->mb_to_bottom_edge < 0) { - if (xd->left_available || x >= 0) { - const int bh = 4 << b_height_log2(plane_bsize); - const int umv_border_start = - bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)); - - if (y + bh > umv_border_start) { - const uint8_t c = buf[(umv_border_start - 1) * stride + x]; - uint8_t *d = &buf[umv_border_start * stride + x]; - int i; - for (i = 0; i < bh; ++i, d += stride) - *d = c; - } - } - } -} - -static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, - BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - int has_eob, int aoff, int loff) { - ENTROPY_CONTEXT *const a = pd->above_context + aoff; - ENTROPY_CONTEXT *const l = pd->left_context + loff; - const int tx_size_in_blocks = 1 << tx_size; - - // above - if (has_eob && xd->mb_to_right_edge < 0) { - int i; - const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] + - (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); - int above_contexts = tx_size_in_blocks; - if (above_contexts + aoff > blocks_wide) - above_contexts = blocks_wide - aoff; - - for (i = 0; i < above_contexts; ++i) - a[i] = has_eob; - for (i = above_contexts; i < tx_size_in_blocks; ++i) - a[i] = 0; - } else { - vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } - - // left - if (has_eob && xd->mb_to_bottom_edge < 0) { - int i; - const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] + - (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); - int left_contexts = tx_size_in_blocks; - if (left_contexts + loff > blocks_high) - left_contexts = blocks_high - loff; - - for (i = 0; i < left_contexts; ++i) - l[i] = has_eob; - for (i = left_contexts; i < tx_size_in_blocks; ++i) - l[i] = 0; - } else { - vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); - } -} - -static int get_tx_eob(const struct segmentation *seg, int segment_id, - TX_SIZE tx_size) { - const int eob_max = 16 << (tx_size << 1); - return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 
0 : eob_max; -} +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/media/libvpx/vp9/common/vp9_common.h b/media/libvpx/vp9/common/vp9_common.h index 36d1cdf14633..5587192e8a5f 100644 --- a/media/libvpx/vp9/common/vp9_common.h +++ b/media/libvpx/vp9/common/vp9_common.h @@ -18,6 +18,11 @@ #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_systemdependent.h" + +#ifdef __cplusplus +extern "C" { +#endif #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -40,11 +45,11 @@ vpx_memcpy(dest, src, n * sizeof(*src)); \ } -#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)) +#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest)) #define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)) static INLINE uint8_t clip_pixel(int val) { - return (val > 255) ? 255u : (val < 0) ? 0u : val; + return (val > 255) ? 255 : (val < 0) ? 0 : val; } static INLINE int clamp(int value, int low, int high) { @@ -55,18 +60,15 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -static int get_unsigned_bits(unsigned int num_values) { - int cat = 0; - if (num_values <= 1) - return 0; - num_values--; - while (num_values > 0) { - cat++; - num_values >>= 1; - } - return cat; +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; } +#if CONFIG_VP9_HIGHBITDEPTH +#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 )) +#endif // CONFIG_VP9_HIGHBITDEPTH + #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) do { \ lval = (expr); \ @@ -91,4 +93,8 @@ static int get_unsigned_bits(unsigned int num_values) { #define VP9_FRAME_MARKER 0x2 +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/media/libvpx/vp9/common/vp9_common_data.c b/media/libvpx/vp9/common/vp9_common_data.c index f858900a4f4e..d4c1b7124de9 100644 --- a/media/libvpx/vp9/common/vp9_common_data.c +++ b/media/libvpx/vp9/common/vp9_common_data.c @@ -26,8 +26,6 @@ const int mi_width_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; -const int mi_height_log2_lookup[BLOCK_SIZES] = - {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; const int num_8x8_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; @@ -108,11 +106,12 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = { TX_16X16, TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32 }; -const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = { - TX_4X4, TX_4X4, TX_4X4, - TX_4X4, TX_4X4, TX_4X4, - TX_8X8, TX_8X8, TX_8X8, - TX_16X16, TX_16X16, TX_16X16, TX_32X32 + +const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 }; const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { @@ -123,8 +122,6 @@ const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { TX_32X32, // TX_MODE_SELECT }; - - const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 @@ -143,4 +140,24 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = { {{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}}, }; - +// Generates 4 bit field in which each 
bit set to 1 represents +// a blocksize partition 1111 means we split 64x64, 32x32, 16x16 +// and 8x8. 1000 means we just split the 64x64 to 32x32 +const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]= { + {15, 15}, // 4X4 - {0b1111, 0b1111} + {15, 14}, // 4X8 - {0b1111, 0b1110} + {14, 15}, // 8X4 - {0b1110, 0b1111} + {14, 14}, // 8X8 - {0b1110, 0b1110} + {14, 12}, // 8X16 - {0b1110, 0b1100} + {12, 14}, // 16X8 - {0b1100, 0b1110} + {12, 12}, // 16X16 - {0b1100, 0b1100} + {12, 8 }, // 16X32 - {0b1100, 0b1000} + {8, 12}, // 32X16 - {0b1000, 0b1100} + {8, 8 }, // 32X32 - {0b1000, 0b1000} + {8, 0 }, // 32X64 - {0b1000, 0b0000} + {0, 8 }, // 64X32 - {0b0000, 0b1000} + {0, 0 }, // 64X64 - {0b0000, 0b0000} +}; diff --git a/media/libvpx/vp9/common/vp9_common_data.h b/media/libvpx/vp9/common/vp9_common_data.h index c1f6405364bc..a06c9bed81a8 100644 --- a/media/libvpx/vp9/common/vp9_common_data.h +++ b/media/libvpx/vp9/common/vp9_common_data.h @@ -13,10 +13,13 @@ #include "vp9/common/vp9_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + extern const int b_width_log2_lookup[BLOCK_SIZES]; extern const int b_height_log2_lookup[BLOCK_SIZES]; extern const int mi_width_log2_lookup[BLOCK_SIZES]; -extern const int mi_height_log2_lookup[BLOCK_SIZES]; extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES]; extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES]; extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES]; @@ -26,8 +29,12 @@ extern const int num_pels_log2_lookup[BLOCK_SIZES]; extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; -extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES]; +extern const BLOCK_SIZE txsize_to_bsize[TX_SIZES]; extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES]; extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2]; -#endif // VP9_COMMON_VP9_COMMON_DATA_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/media/libvpx/vp9/common/vp9_convolve.c b/media/libvpx/vp9/common/vp9_convolve.c index a2d864c72e14..d8aaf32c44a1 100644 --- a/media/libvpx/vp9/common/vp9_convolve.c +++ b/media/libvpx/vp9/common/vp9_convolve.c @@ -18,40 +18,21 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x0, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_x_base = - (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source line */ - src -= taps / 2 - 1; - +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { - /* Initial phase offset */ - int x_q4 = (int)(filter_x0 - filter_x_base) / taps; - + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { - /* Per-pixel src offset */ - const int src_x = x_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_x = filter_x_base + - (x_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[src_x + k] * filter_x[k]; - + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - - /* Move to the next source pixel */ x_q4 += x_step_q4; } src += src_stride; @@ -59,41 +40,22 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x0, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. */ - const int16_t *const filter_x_base = - (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source line */ - src -= taps / 2 - 1; - +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { - /* Initial phase offset */ - int x_q4 = (int)(filter_x0 - filter_x_base) / taps; - + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { - /* Per-pixel src offset */ - const int src_x = x_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_x = filter_x_base + - (x_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[src_x + k] * filter_x[k]; - + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_x[k] * x_filter[k]; dst[x] = ROUND_POWER_OF_TWO(dst[x] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - - /* Move to the next source pixel */ + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); x_q4 += x_step_q4; } src += src_stride; @@ -101,41 +63,22 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y0, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_y_base = - (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source column */ - src -= src_stride * (taps / 2 - 1); +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { - /* Initial phase offset */ - int y_q4 = (int)(filter_y0 - filter_y_base) / taps; - + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - /* Per-pixel src offset */ - const int src_y = y_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_y = filter_y_base + - (y_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[(src_y + k) * src_stride] * filter_y[k]; - - dst[y * dst_stride] = - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - - /* Move to the next source pixel */ + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); y_q4 += y_step_q4; } ++src; @@ -143,41 +86,23 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y0, int y_step_q4, - int w, int h, int taps) { - int x, y, k; - - /* NOTE: This assumes that the filter table is 256-byte aligned. */ - /* TODO(agrange) Modify to make independent of table alignment. 
*/ - const int16_t *const filter_y_base = - (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); - - /* Adjust base pointer address for this source column */ - src -= src_stride * (taps / 2 - 1); +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { - /* Initial phase offset */ - int y_q4 = (int)(filter_y0 - filter_y_base) / taps; - + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - /* Per-pixel src offset */ - const int src_y = y_q4 >> SUBPEL_BITS; - int sum = 0; - - /* Pointer to filter to use */ - const int16_t *const filter_y = filter_y_base + - (y_q4 & SUBPEL_MASK) * taps; - - for (k = 0; k < taps; ++k) - sum += src[(src_y + k) * src_stride] * filter_y[k]; - + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - - /* Move to the next source pixel */ + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); y_q4 += y_step_q4; } ++src; @@ -185,33 +110,50 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int taps) { - /* Fixed size intermediate buffer places limits on parameters. - * Maximum intermediate_height is 324, for y_step_q4 == 80, - * h == 64, taps == 8. - * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc - */ - uint8_t temp[64 * 324]; - int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps; +static void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
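Plugging the worst case that the asserts below allow (h == 64, y_step_q4 == 32) into the intermediate_height expression this function uses gives 134 rows, one inside the 135-row bound derived in the comment above. A quick standalone check:

#include <stdio.h>

#define SUBPEL_TAPS 8

int main(void) {
  const int h = 64, y_step_q4 = 32;  /* maxima permitted by the asserts */
  const int intermediate_height =
      (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
  printf("%d rows needed of 135 available\n", intermediate_height); /* 134 */
  return 0;
}
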
+ uint8_t temp[135 * 64]; + int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(taps <= 8); - assert(y_step_q4 <= 80); - assert(x_step_q4 <= 80); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); if (intermediate_height < h) intermediate_height = h; - convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, - intermediate_height, taps); - convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, taps); + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + x_filters, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h); +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); } void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -219,8 +161,14 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); } void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, @@ -228,8 +176,14 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h); } void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -237,8 +191,14 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); } void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, @@ -246,8 +206,14 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_y = 
get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h); } void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, @@ -255,8 +221,15 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, + filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); } void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, @@ -269,9 +242,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vp9_convolve8(src, src_stride, temp, 64, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vp9_convolve8_c(src, src_stride, temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); } void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, @@ -281,6 +254,9 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int r; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + for (r = h; r > 0; --r) { vpx_memcpy(dst, src, w); src += src_stride; @@ -295,6 +271,9 @@ void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int x, y; + (void)filter_x; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; + for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); diff --git a/media/libvpx/vp9/common/vp9_convolve.h b/media/libvpx/vp9/common/vp9_convolve.h index 29d499063c6d..6bf71fc79430 100644 --- a/media/libvpx/vp9/common/vp9_convolve.h +++ b/media/libvpx/vp9/common/vp9_convolve.h @@ -13,10 +13,18 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_CONVOLVE_H_ diff --git a/media/libvpx/vp9/common/vp9_debugmodes.c b/media/libvpx/vp9/common/vp9_debugmodes.c index 355ac1a49008..d2522bbdfc84 100644 --- a/media/libvpx/vp9/common/vp9_debugmodes.c +++ b/media/libvpx/vp9/common/vp9_debugmodes.c @@ -22,12 +22,11 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { * and uses the passed in member offset to print out the value of an integer * for each mbmi member value in the mi structure. 
*/ -static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, +static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor, size_t member_offset) { - int mi_row; - int mi_col; + int mi_row, mi_col; int mi_index = 0; - MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; char prefix = descriptor[0]; @@ -38,7 +37,7 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(file, "%2d ", - *((int*) ((char *) (&mi_8x8[mi_index]->mbmi) + + *((int*) ((char *) (&mi[mi_index]->mbmi) + member_offset))); mi_index++; } @@ -47,18 +46,18 @@ static void print_mi_data(VP9_COMMON *cm, FILE *file, char *descriptor, } fprintf(file, "\n"); } -void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) { int mi_row; int mi_col; int mi_index = 0; FILE *mvs = fopen(file, "a"); - MODE_INFO **mi_8x8 = cm->mi_grid_visible; + MODE_INFO **mi = cm->mi_grid_visible; int rows = cm->mi_rows; int cols = cm->mi_cols; print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); - print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip_coeff)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip)); print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size)); print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); @@ -67,8 +66,8 @@ void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { for (mi_row = 0; mi_row < rows; mi_row++) { fprintf(mvs, "V "); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%4d:%4d ", mi_8x8[mi_index]->mbmi.mv[0].as_mv.row, - mi_8x8[mi_index]->mbmi.mv[0].as_mv.col); + fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row, + mi[mi_index]->mbmi.mv[0].as_mv.col); mi_index++; } fprintf(mvs, "\n"); diff --git a/media/libvpx/vp9/common/vp9_default_coef_probs.h b/media/libvpx/vp9/common/vp9_default_coef_probs.h deleted file mode 100644 index 3b512beb9d3f..000000000000 --- a/media/libvpx/vp9/common/vp9_default_coef_probs.h +++ /dev/null @@ -1,699 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
-*/ -#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_ -#define VP9_COMMON_DEFAULT_COEF_PROBS_H_ - -/*Generated file, included by vp9_entropy.c*/ -static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 195, 29, 183 }, - { 84, 49, 136 }, - { 8, 42, 71 } - }, { /* Coeff Band 1 */ - { 31, 107, 169 }, - { 35, 99, 159 }, - { 17, 82, 140 }, - { 8, 66, 114 }, - { 2, 44, 76 }, - { 1, 19, 32 } - }, { /* Coeff Band 2 */ - { 40, 132, 201 }, - { 29, 114, 187 }, - { 13, 91, 157 }, - { 7, 75, 127 }, - { 3, 58, 95 }, - { 1, 28, 47 } - }, { /* Coeff Band 3 */ - { 69, 142, 221 }, - { 42, 122, 201 }, - { 15, 91, 159 }, - { 6, 67, 121 }, - { 1, 42, 77 }, - { 1, 17, 31 } - }, { /* Coeff Band 4 */ - { 102, 148, 228 }, - { 67, 117, 204 }, - { 17, 82, 154 }, - { 6, 59, 114 }, - { 2, 39, 75 }, - { 1, 15, 29 } - }, { /* Coeff Band 5 */ - { 156, 57, 233 }, - { 119, 57, 212 }, - { 58, 48, 163 }, - { 29, 40, 124 }, - { 12, 30, 81 }, - { 3, 12, 31 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 191, 107, 226 }, - { 124, 117, 204 }, - { 25, 99, 155 } - }, { /* Coeff Band 1 */ - { 29, 148, 210 }, - { 37, 126, 194 }, - { 8, 93, 157 }, - { 2, 68, 118 }, - { 1, 39, 69 }, - { 1, 17, 33 } - }, { /* Coeff Band 2 */ - { 41, 151, 213 }, - { 27, 123, 193 }, - { 3, 82, 144 }, - { 1, 58, 105 }, - { 1, 32, 60 }, - { 1, 13, 26 } - }, { /* Coeff Band 3 */ - { 59, 159, 220 }, - { 23, 126, 198 }, - { 4, 88, 151 }, - { 1, 66, 114 }, - { 1, 38, 71 }, - { 1, 18, 34 } - }, { /* Coeff Band 4 */ - { 114, 136, 232 }, - { 51, 114, 207 }, - { 11, 83, 155 }, - { 3, 56, 105 }, - { 1, 33, 65 }, - { 1, 17, 34 } - }, { /* Coeff Band 5 */ - { 149, 65, 234 }, - { 121, 57, 215 }, - { 61, 49, 166 }, - { 28, 36, 114 }, - { 12, 25, 76 }, - { 3, 16, 42 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 214, 49, 220 }, - { 132, 63, 188 }, - { 42, 65, 137 } - }, { /* Coeff Band 1 */ - { 85, 137, 221 }, - { 104, 131, 216 }, - { 49, 111, 192 }, - { 21, 87, 155 }, - { 2, 49, 87 }, - { 1, 16, 28 } - }, { /* Coeff Band 2 */ - { 89, 163, 230 }, - { 90, 137, 220 }, - { 29, 100, 183 }, - { 10, 70, 135 }, - { 2, 42, 81 }, - { 1, 17, 33 } - }, { /* Coeff Band 3 */ - { 108, 167, 237 }, - { 55, 133, 222 }, - { 15, 97, 179 }, - { 4, 72, 135 }, - { 1, 45, 85 }, - { 1, 19, 38 } - }, { /* Coeff Band 4 */ - { 124, 146, 240 }, - { 66, 124, 224 }, - { 17, 88, 175 }, - { 4, 58, 122 }, - { 1, 36, 75 }, - { 1, 18, 37 } - }, { /* Coeff Band 5 */ - { 141, 79, 241 }, - { 126, 70, 227 }, - { 66, 58, 182 }, - { 30, 44, 136 }, - { 12, 34, 96 }, - { 2, 20, 47 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 229, 99, 249 }, - { 143, 111, 235 }, - { 46, 109, 192 } - }, { /* Coeff Band 1 */ - { 82, 158, 236 }, - { 94, 146, 224 }, - { 25, 117, 191 }, - { 9, 87, 149 }, - { 3, 56, 99 }, - { 1, 33, 57 } - }, { /* Coeff Band 2 */ - { 83, 167, 237 }, - { 68, 145, 222 }, - { 10, 103, 177 }, - { 2, 72, 131 }, - { 1, 41, 79 }, - { 1, 20, 39 } - }, { /* Coeff Band 3 */ - { 99, 167, 239 }, - { 47, 141, 224 }, - { 10, 104, 178 }, - { 2, 73, 133 }, - { 1, 44, 85 }, - { 1, 22, 47 } - }, { /* Coeff Band 4 */ - { 127, 145, 243 }, - { 71, 129, 228 }, - { 17, 93, 177 }, - { 3, 61, 124 }, - { 1, 41, 84 }, - { 1, 21, 52 } - }, { /* Coeff Band 5 */ - { 157, 78, 244 }, - { 140, 72, 231 }, - { 69, 58, 184 }, - { 31, 44, 137 }, - { 14, 38, 105 }, - { 8, 23, 61 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff 
Band 0 */ - { 125, 34, 187 }, - { 52, 41, 133 }, - { 6, 31, 56 } - }, { /* Coeff Band 1 */ - { 37, 109, 153 }, - { 51, 102, 147 }, - { 23, 87, 128 }, - { 8, 67, 101 }, - { 1, 41, 63 }, - { 1, 19, 29 } - }, { /* Coeff Band 2 */ - { 31, 154, 185 }, - { 17, 127, 175 }, - { 6, 96, 145 }, - { 2, 73, 114 }, - { 1, 51, 82 }, - { 1, 28, 45 } - }, { /* Coeff Band 3 */ - { 23, 163, 200 }, - { 10, 131, 185 }, - { 2, 93, 148 }, - { 1, 67, 111 }, - { 1, 41, 69 }, - { 1, 14, 24 } - }, { /* Coeff Band 4 */ - { 29, 176, 217 }, - { 12, 145, 201 }, - { 3, 101, 156 }, - { 1, 69, 111 }, - { 1, 39, 63 }, - { 1, 14, 23 } - }, { /* Coeff Band 5 */ - { 57, 192, 233 }, - { 25, 154, 215 }, - { 6, 109, 167 }, - { 3, 78, 118 }, - { 1, 48, 69 }, - { 1, 21, 29 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 202, 105, 245 }, - { 108, 106, 216 }, - { 18, 90, 144 } - }, { /* Coeff Band 1 */ - { 33, 172, 219 }, - { 64, 149, 206 }, - { 14, 117, 177 }, - { 5, 90, 141 }, - { 2, 61, 95 }, - { 1, 37, 57 } - }, { /* Coeff Band 2 */ - { 33, 179, 220 }, - { 11, 140, 198 }, - { 1, 89, 148 }, - { 1, 60, 104 }, - { 1, 33, 57 }, - { 1, 12, 21 } - }, { /* Coeff Band 3 */ - { 30, 181, 221 }, - { 8, 141, 198 }, - { 1, 87, 145 }, - { 1, 58, 100 }, - { 1, 31, 55 }, - { 1, 12, 20 } - }, { /* Coeff Band 4 */ - { 32, 186, 224 }, - { 7, 142, 198 }, - { 1, 86, 143 }, - { 1, 58, 100 }, - { 1, 31, 55 }, - { 1, 12, 22 } - }, { /* Coeff Band 5 */ - { 57, 192, 227 }, - { 20, 143, 204 }, - { 3, 96, 154 }, - { 1, 68, 112 }, - { 1, 42, 69 }, - { 1, 19, 32 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 212, 35, 215 }, - { 113, 47, 169 }, - { 29, 48, 105 } - }, { /* Coeff Band 1 */ - { 74, 129, 203 }, - { 106, 120, 203 }, - { 49, 107, 178 }, - { 19, 84, 144 }, - { 4, 50, 84 }, - { 1, 15, 25 } - }, { /* Coeff Band 2 */ - { 71, 172, 217 }, - { 44, 141, 209 }, - { 15, 102, 173 }, - { 6, 76, 133 }, - { 2, 51, 89 }, - { 1, 24, 42 } - }, { /* Coeff Band 3 */ - { 64, 185, 231 }, - { 31, 148, 216 }, - { 8, 103, 175 }, - { 3, 74, 131 }, - { 1, 46, 81 }, - { 1, 18, 30 } - }, { /* Coeff Band 4 */ - { 65, 196, 235 }, - { 25, 157, 221 }, - { 5, 105, 174 }, - { 1, 67, 120 }, - { 1, 38, 69 }, - { 1, 15, 30 } - }, { /* Coeff Band 5 */ - { 65, 204, 238 }, - { 30, 156, 224 }, - { 7, 107, 177 }, - { 2, 70, 124 }, - { 1, 42, 73 }, - { 1, 18, 34 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 225, 86, 251 }, - { 144, 104, 235 }, - { 42, 99, 181 } - }, { /* Coeff Band 1 */ - { 85, 175, 239 }, - { 112, 165, 229 }, - { 29, 136, 200 }, - { 12, 103, 162 }, - { 6, 77, 123 }, - { 2, 53, 84 } - }, { /* Coeff Band 2 */ - { 75, 183, 239 }, - { 30, 155, 221 }, - { 3, 106, 171 }, - { 1, 74, 128 }, - { 1, 44, 76 }, - { 1, 17, 28 } - }, { /* Coeff Band 3 */ - { 73, 185, 240 }, - { 27, 159, 222 }, - { 2, 107, 172 }, - { 1, 75, 127 }, - { 1, 42, 73 }, - { 1, 17, 29 } - }, { /* Coeff Band 4 */ - { 62, 190, 238 }, - { 21, 159, 222 }, - { 2, 107, 172 }, - { 1, 72, 122 }, - { 1, 40, 71 }, - { 1, 18, 32 } - }, { /* Coeff Band 5 */ - { 61, 199, 240 }, - { 27, 161, 226 }, - { 4, 113, 180 }, - { 1, 76, 129 }, - { 1, 46, 80 }, - { 1, 23, 41 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 7, 27, 153 }, - { 5, 30, 95 }, - { 1, 16, 30 } - }, { /* Coeff Band 1 */ - { 50, 75, 127 }, - { 57, 75, 124 }, - { 27, 67, 108 }, - { 10, 54, 86 }, - { 1, 33, 52 }, - { 1, 12, 18 } - }, { /* Coeff Band 2 */ - { 43, 125, 151 }, - { 26, 108, 148 }, - { 7, 83, 122 }, 
- { 2, 59, 89 }, - { 1, 38, 60 }, - { 1, 17, 27 } - }, { /* Coeff Band 3 */ - { 23, 144, 163 }, - { 13, 112, 154 }, - { 2, 75, 117 }, - { 1, 50, 81 }, - { 1, 31, 51 }, - { 1, 14, 23 } - }, { /* Coeff Band 4 */ - { 18, 162, 185 }, - { 6, 123, 171 }, - { 1, 78, 125 }, - { 1, 51, 86 }, - { 1, 31, 54 }, - { 1, 14, 23 } - }, { /* Coeff Band 5 */ - { 15, 199, 227 }, - { 3, 150, 204 }, - { 1, 91, 146 }, - { 1, 55, 95 }, - { 1, 30, 53 }, - { 1, 11, 20 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 19, 55, 240 }, - { 19, 59, 196 }, - { 3, 52, 105 } - }, { /* Coeff Band 1 */ - { 41, 166, 207 }, - { 104, 153, 199 }, - { 31, 123, 181 }, - { 14, 101, 152 }, - { 5, 72, 106 }, - { 1, 36, 52 } - }, { /* Coeff Band 2 */ - { 35, 176, 211 }, - { 12, 131, 190 }, - { 2, 88, 144 }, - { 1, 60, 101 }, - { 1, 36, 60 }, - { 1, 16, 28 } - }, { /* Coeff Band 3 */ - { 28, 183, 213 }, - { 8, 134, 191 }, - { 1, 86, 142 }, - { 1, 56, 96 }, - { 1, 30, 53 }, - { 1, 12, 20 } - }, { /* Coeff Band 4 */ - { 20, 190, 215 }, - { 4, 135, 192 }, - { 1, 84, 139 }, - { 1, 53, 91 }, - { 1, 28, 49 }, - { 1, 11, 20 } - }, { /* Coeff Band 5 */ - { 13, 196, 216 }, - { 2, 137, 192 }, - { 1, 86, 143 }, - { 1, 57, 99 }, - { 1, 32, 56 }, - { 1, 13, 24 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 211, 29, 217 }, - { 96, 47, 156 }, - { 22, 43, 87 } - }, { /* Coeff Band 1 */ - { 78, 120, 193 }, - { 111, 116, 186 }, - { 46, 102, 164 }, - { 15, 80, 128 }, - { 2, 49, 76 }, - { 1, 18, 28 } - }, { /* Coeff Band 2 */ - { 71, 161, 203 }, - { 42, 132, 192 }, - { 10, 98, 150 }, - { 3, 69, 109 }, - { 1, 44, 70 }, - { 1, 18, 29 } - }, { /* Coeff Band 3 */ - { 57, 186, 211 }, - { 30, 140, 196 }, - { 4, 93, 146 }, - { 1, 62, 102 }, - { 1, 38, 65 }, - { 1, 16, 27 } - }, { /* Coeff Band 4 */ - { 47, 199, 217 }, - { 14, 145, 196 }, - { 1, 88, 142 }, - { 1, 57, 98 }, - { 1, 36, 62 }, - { 1, 15, 26 } - }, { /* Coeff Band 5 */ - { 26, 219, 229 }, - { 5, 155, 207 }, - { 1, 94, 151 }, - { 1, 60, 104 }, - { 1, 36, 62 }, - { 1, 16, 28 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 233, 29, 248 }, - { 146, 47, 220 }, - { 43, 52, 140 } - }, { /* Coeff Band 1 */ - { 100, 163, 232 }, - { 179, 161, 222 }, - { 63, 142, 204 }, - { 37, 113, 174 }, - { 26, 89, 137 }, - { 18, 68, 97 } - }, { /* Coeff Band 2 */ - { 85, 181, 230 }, - { 32, 146, 209 }, - { 7, 100, 164 }, - { 3, 71, 121 }, - { 1, 45, 77 }, - { 1, 18, 30 } - }, { /* Coeff Band 3 */ - { 65, 187, 230 }, - { 20, 148, 207 }, - { 2, 97, 159 }, - { 1, 68, 116 }, - { 1, 40, 70 }, - { 1, 14, 29 } - }, { /* Coeff Band 4 */ - { 40, 194, 227 }, - { 8, 147, 204 }, - { 1, 94, 155 }, - { 1, 65, 112 }, - { 1, 39, 66 }, - { 1, 14, 26 } - }, { /* Coeff Band 5 */ - { 16, 208, 228 }, - { 3, 151, 207 }, - { 1, 98, 160 }, - { 1, 67, 117 }, - { 1, 41, 74 }, - { 1, 17, 31 } - } - } - } -}; -static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { - { /* block Type 0 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 17, 38, 140 }, - { 7, 34, 80 }, - { 1, 17, 29 } - }, { /* Coeff Band 1 */ - { 37, 75, 128 }, - { 41, 76, 128 }, - { 26, 66, 116 }, - { 12, 52, 94 }, - { 2, 32, 55 }, - { 1, 10, 16 } - }, { /* Coeff Band 2 */ - { 50, 127, 154 }, - { 37, 109, 152 }, - { 16, 82, 121 }, - { 5, 59, 85 }, - { 1, 35, 54 }, - { 1, 13, 20 } - }, { /* Coeff Band 3 */ - { 40, 142, 167 }, - { 17, 110, 157 }, - { 2, 71, 112 }, - { 1, 44, 72 }, - { 1, 27, 45 }, - { 1, 11, 17 } - }, { /* Coeff Band 4 */ - { 30, 175, 188 }, - { 9, 124, 169 }, - { 1, 74, 116 }, - { 1, 48, 78 }, - { 1, 30, 49 }, - { 1, 
11, 18 } - }, { /* Coeff Band 5 */ - { 10, 222, 223 }, - { 2, 150, 194 }, - { 1, 83, 128 }, - { 1, 48, 79 }, - { 1, 27, 45 }, - { 1, 11, 17 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 36, 41, 235 }, - { 29, 36, 193 }, - { 10, 27, 111 } - }, { /* Coeff Band 1 */ - { 85, 165, 222 }, - { 177, 162, 215 }, - { 110, 135, 195 }, - { 57, 113, 168 }, - { 23, 83, 120 }, - { 10, 49, 61 } - }, { /* Coeff Band 2 */ - { 85, 190, 223 }, - { 36, 139, 200 }, - { 5, 90, 146 }, - { 1, 60, 103 }, - { 1, 38, 65 }, - { 1, 18, 30 } - }, { /* Coeff Band 3 */ - { 72, 202, 223 }, - { 23, 141, 199 }, - { 2, 86, 140 }, - { 1, 56, 97 }, - { 1, 36, 61 }, - { 1, 16, 27 } - }, { /* Coeff Band 4 */ - { 55, 218, 225 }, - { 13, 145, 200 }, - { 1, 86, 141 }, - { 1, 57, 99 }, - { 1, 35, 61 }, - { 1, 13, 22 } - }, { /* Coeff Band 5 */ - { 15, 235, 212 }, - { 1, 132, 184 }, - { 1, 84, 139 }, - { 1, 57, 97 }, - { 1, 34, 56 }, - { 1, 14, 23 } - } - } - }, { /* block Type 1 */ - { /* Intra */ - { /* Coeff Band 0 */ - { 181, 21, 201 }, - { 61, 37, 123 }, - { 10, 38, 71 } - }, { /* Coeff Band 1 */ - { 47, 106, 172 }, - { 95, 104, 173 }, - { 42, 93, 159 }, - { 18, 77, 131 }, - { 4, 50, 81 }, - { 1, 17, 23 } - }, { /* Coeff Band 2 */ - { 62, 147, 199 }, - { 44, 130, 189 }, - { 28, 102, 154 }, - { 18, 75, 115 }, - { 2, 44, 65 }, - { 1, 12, 19 } - }, { /* Coeff Band 3 */ - { 55, 153, 210 }, - { 24, 130, 194 }, - { 3, 93, 146 }, - { 1, 61, 97 }, - { 1, 31, 50 }, - { 1, 10, 16 } - }, { /* Coeff Band 4 */ - { 49, 186, 223 }, - { 17, 148, 204 }, - { 1, 96, 142 }, - { 1, 53, 83 }, - { 1, 26, 44 }, - { 1, 11, 17 } - }, { /* Coeff Band 5 */ - { 13, 217, 212 }, - { 2, 136, 180 }, - { 1, 78, 124 }, - { 1, 50, 83 }, - { 1, 29, 49 }, - { 1, 14, 23 } - } - }, { /* Inter */ - { /* Coeff Band 0 */ - { 197, 13, 247 }, - { 82, 17, 222 }, - { 25, 17, 162 } - }, { /* Coeff Band 1 */ - { 126, 186, 247 }, - { 234, 191, 243 }, - { 176, 177, 234 }, - { 104, 158, 220 }, - { 66, 128, 186 }, - { 55, 90, 137 } - }, { /* Coeff Band 2 */ - { 111, 197, 242 }, - { 46, 158, 219 }, - { 9, 104, 171 }, - { 2, 65, 125 }, - { 1, 44, 80 }, - { 1, 17, 91 } - }, { /* Coeff Band 3 */ - { 104, 208, 245 }, - { 39, 168, 224 }, - { 3, 109, 162 }, - { 1, 79, 124 }, - { 1, 50, 102 }, - { 1, 43, 102 } - }, { /* Coeff Band 4 */ - { 84, 220, 246 }, - { 31, 177, 231 }, - { 2, 115, 180 }, - { 1, 79, 134 }, - { 1, 55, 77 }, - { 1, 60, 79 } - }, { /* Coeff Band 5 */ - { 43, 243, 240 }, - { 8, 180, 217 }, - { 1, 115, 166 }, - { 1, 84, 121 }, - { 1, 51, 67 }, - { 1, 16, 6 } - } - } - } -}; - -#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/media/libvpx/vp9/common/vp9_entropy.c b/media/libvpx/vp9/common/vp9_entropy.c index feceb667a46b..3a54de225816 100644 --- a/media/libvpx/vp9/common/vp9_entropy.c +++ b/media/libvpx/vp9/common/vp9_entropy.c @@ -15,29 +15,16 @@ #include "vpx_mem/vpx_mem.h" #include "vpx/vpx_integer.h" -#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) - -DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +const vp9_prob vp9_cat1_prob[] = { 159 }; +const vp9_prob vp9_cat2_prob[] = { 165, 145 }; +const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 }; +const vp9_prob vp9_cat4_prob[] = { 176, 155, 140, 135 }; +const vp9_prob vp9_cat5_prob[] = { 180, 157, 141, 134, 130 }; +const vp9_prob vp9_cat6_prob[] = { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; -DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_8x8plus[1024]) = { +const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -106,50 +93,17 @@ DECLARE_ALIGNED(16, const uint8_t, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, - vp9_coefband_trans_4x4[16]) = { +const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, }; -DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { +const uint8_t vp9_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 }; - - -/* Array indices are identical to previously-existing CONTEXT_NODE indices */ - -const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = { - -DCT_EOB_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ - -ONE_TOKEN, 6, /* 2 = ONE */ - 8, 12, /* 3 = LOW_VAL */ - -TWO_TOKEN, 10, /* 4 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ - 14, 16, /* 6 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ - 18, 20, /* 8 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ -}; - -struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - -/* Trees for extra bits. Probabilities are constant and - do not depend on previously encoded bits */ - -static const vp9_prob Pcat1[] = { 159}; -static const vp9_prob Pcat2[] = { 165, 145}; -static const vp9_prob Pcat3[] = { 173, 148, 140}; -static const vp9_prob Pcat4[] = { 176, 155, 140, 135}; -static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const vp9_prob Pcat6[] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 -}; - -const vp9_tree_index vp9_coefmodel_tree[6] = { - -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ +const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = { + -EOB_MODEL_TOKEN, 2, + -ZERO_TOKEN, 4, -ONE_TOKEN, -TWO_TOKEN, }; @@ -162,198 +116,617 @@ const vp9_tree_index vp9_coefmodel_tree[6] = { // the probabilities for the rest of the nodes. 
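+// Each row below supplies the MODEL_NODES (= ENTROPY_NODES -
+// UNCONSTRAINED_NODES) probabilities that complete a model whose pivot-node
+// probability indexes the row; see extend_to_full_distribution() further down.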
// beta = 8 -static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { + +// Every odd line in this table can be generated from the even lines +// by averaging : +// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] + +// vp9_pareto8_full[l+1][node] ) >> 1; +const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = { { 3, 86, 128, 6, 86, 23, 88, 29}, + { 6, 86, 128, 11, 87, 42, 91, 52}, { 9, 86, 129, 17, 88, 61, 94, 76}, + { 12, 86, 129, 22, 88, 77, 97, 93}, { 15, 87, 129, 28, 89, 93, 100, 110}, + { 17, 87, 129, 33, 90, 105, 103, 123}, { 20, 88, 130, 38, 91, 118, 106, 136}, + { 23, 88, 130, 43, 91, 128, 108, 146}, { 26, 89, 131, 48, 92, 139, 111, 156}, + { 28, 89, 131, 53, 93, 147, 114, 163}, { 31, 90, 131, 58, 94, 156, 117, 171}, + { 34, 90, 131, 62, 94, 163, 119, 177}, { 37, 90, 132, 66, 95, 171, 122, 184}, + { 39, 90, 132, 70, 96, 177, 124, 189}, { 42, 91, 132, 75, 97, 183, 127, 194}, + { 44, 91, 132, 79, 97, 188, 129, 198}, { 47, 92, 133, 83, 98, 193, 132, 202}, + { 49, 92, 133, 86, 99, 197, 134, 205}, { 52, 93, 133, 90, 100, 201, 137, 208}, + { 54, 93, 133, 94, 100, 204, 139, 211}, { 57, 94, 134, 98, 101, 208, 142, 214}, + { 59, 94, 134, 101, 102, 211, 144, 216}, { 62, 94, 135, 105, 103, 214, 146, 218}, + { 64, 94, 135, 108, 103, 216, 148, 220}, { 66, 95, 135, 111, 104, 219, 151, 222}, + { 68, 95, 135, 114, 105, 221, 153, 223}, { 71, 96, 136, 117, 106, 224, 155, 225}, + { 73, 96, 136, 120, 106, 225, 157, 226}, { 76, 97, 136, 123, 107, 227, 159, 228}, + { 78, 97, 136, 126, 108, 229, 160, 229}, { 80, 98, 137, 129, 109, 231, 162, 231}, + { 82, 98, 137, 131, 109, 232, 164, 232}, { 84, 98, 138, 134, 110, 234, 166, 233}, + { 86, 98, 138, 137, 111, 235, 168, 234}, { 89, 99, 138, 140, 112, 236, 170, 235}, + { 91, 99, 138, 142, 112, 237, 171, 235}, { 93, 100, 139, 145, 113, 238, 173, 236}, + { 95, 100, 139, 147, 114, 239, 174, 237}, { 97, 101, 140, 149, 115, 240, 176, 238}, + { 99, 101, 140, 151, 115, 241, 177, 238}, {101, 102, 140, 154, 116, 242, 179, 239}, + {103, 102, 140, 156, 117, 242, 180, 239}, {105, 103, 141, 158, 118, 243, 182, 240}, + {107, 103, 141, 160, 118, 243, 183, 240}, {109, 104, 141, 162, 119, 244, 185, 241}, + {111, 104, 141, 164, 119, 244, 186, 241}, {113, 104, 142, 166, 120, 245, 187, 242}, + {114, 104, 142, 168, 121, 245, 188, 242}, {116, 105, 143, 170, 122, 246, 190, 243}, + {118, 105, 143, 171, 122, 246, 191, 243}, {120, 106, 143, 173, 123, 247, 192, 244}, + {121, 106, 143, 175, 124, 247, 193, 244}, {123, 107, 144, 177, 125, 248, 195, 244}, + {125, 107, 144, 178, 125, 248, 196, 244}, {127, 108, 145, 180, 126, 249, 197, 245}, + {128, 108, 145, 181, 127, 249, 198, 245}, {130, 109, 145, 183, 128, 249, 199, 245}, + {132, 109, 145, 184, 128, 249, 200, 245}, {134, 110, 146, 186, 129, 250, 201, 246}, + {135, 110, 146, 187, 130, 250, 202, 246}, {137, 111, 147, 189, 131, 251, 203, 246}, + {138, 111, 147, 190, 131, 251, 204, 246}, {140, 112, 147, 192, 132, 251, 205, 247}, + {141, 112, 147, 193, 132, 251, 206, 247}, {143, 113, 148, 194, 133, 251, 207, 247}, + {144, 113, 148, 195, 134, 251, 207, 247}, {146, 114, 149, 197, 135, 252, 208, 248}, + {147, 114, 149, 198, 135, 252, 209, 248}, {149, 115, 149, 199, 136, 252, 210, 248}, + {150, 115, 149, 200, 137, 252, 210, 248}, {152, 115, 150, 201, 138, 252, 211, 248}, + {153, 115, 150, 202, 138, 252, 212, 248}, {155, 116, 151, 204, 139, 253, 213, 249}, + {156, 116, 151, 205, 139, 253, 213, 249}, {158, 117, 151, 206, 140, 253, 214, 249}, + {159, 117, 151, 207, 141, 253, 215, 249}, {161, 118, 152, 208, 
142, 253, 216, 249}, + {162, 118, 152, 209, 142, 253, 216, 249}, {163, 119, 153, 210, 143, 253, 217, 249}, + {164, 119, 153, 211, 143, 253, 217, 249}, {166, 120, 153, 212, 144, 254, 218, 250}, + {167, 120, 153, 212, 145, 254, 219, 250}, {168, 121, 154, 213, 146, 254, 220, 250}, + {169, 121, 154, 214, 146, 254, 220, 250}, {171, 122, 155, 215, 147, 254, 221, 250}, + {172, 122, 155, 216, 147, 254, 221, 250}, {173, 123, 155, 217, 148, 254, 222, 250}, + {174, 123, 155, 217, 149, 254, 222, 250}, {176, 124, 156, 218, 150, 254, 223, 250}, + {177, 124, 156, 219, 150, 254, 223, 250}, {178, 125, 157, 220, 151, 254, 224, 251}, + {179, 125, 157, 220, 151, 254, 224, 251}, {180, 126, 157, 221, 152, 254, 225, 251}, + {181, 126, 157, 221, 152, 254, 225, 251}, {183, 127, 158, 222, 153, 254, 226, 251}, + {184, 127, 158, 223, 154, 254, 226, 251}, {185, 128, 159, 224, 155, 255, 227, 251}, + {186, 128, 159, 224, 155, 255, 227, 251}, {187, 129, 160, 225, 156, 255, 228, 251}, + {188, 130, 160, 225, 156, 255, 228, 251}, {189, 131, 160, 226, 157, 255, 228, 251}, + {190, 131, 160, 226, 158, 255, 228, 251}, {191, 132, 161, 227, 159, 255, 229, 251}, + {192, 132, 161, 227, 159, 255, 229, 251}, {193, 133, 162, 228, 160, 255, 230, 252}, + {194, 133, 162, 229, 160, 255, 230, 252}, {195, 134, 163, 230, 161, 255, 231, 252}, + {196, 134, 163, 230, 161, 255, 231, 252}, {197, 135, 163, 231, 162, 255, 231, 252}, + {198, 135, 163, 231, 162, 255, 231, 252}, {199, 136, 164, 232, 163, 255, 232, 252}, + {200, 136, 164, 232, 164, 255, 232, 252}, + {201, 137, 165, 233, 165, 255, 233, 252}, {201, 137, 165, 233, 165, 255, 233, 252}, {202, 138, 166, 233, 166, 255, 233, 252}, + {203, 138, 166, 233, 166, 255, 233, 252}, {204, 139, 166, 234, 167, 255, 234, 252}, + {205, 139, 166, 234, 167, 255, 234, 252}, + {206, 140, 167, 235, 168, 255, 235, 252}, {206, 140, 167, 235, 168, 255, 235, 252}, {207, 141, 168, 236, 169, 255, 235, 252}, + {208, 141, 168, 236, 170, 255, 235, 252}, {209, 142, 169, 237, 171, 255, 236, 252}, + {209, 143, 169, 237, 171, 255, 236, 252}, {210, 144, 169, 237, 172, 255, 236, 252}, + {211, 144, 169, 237, 172, 255, 236, 252}, {212, 145, 170, 238, 173, 255, 237, 252}, + {213, 145, 170, 238, 173, 255, 237, 252}, + {214, 146, 171, 239, 174, 255, 237, 253}, {214, 146, 171, 239, 174, 255, 237, 253}, {215, 147, 172, 240, 175, 255, 238, 253}, + {215, 147, 172, 240, 175, 255, 238, 253}, {216, 148, 173, 240, 176, 255, 238, 253}, + {217, 148, 173, 240, 176, 255, 238, 253}, {218, 149, 173, 241, 177, 255, 239, 253}, + {218, 149, 173, 241, 178, 255, 239, 253}, {219, 150, 174, 241, 179, 255, 239, 253}, + {219, 151, 174, 241, 179, 255, 239, 253}, {220, 152, 175, 242, 180, 255, 240, 253}, + {221, 152, 175, 242, 180, 255, 240, 253}, + {222, 153, 176, 242, 181, 255, 240, 253}, {222, 153, 176, 242, 181, 255, 240, 253}, {223, 154, 177, 243, 182, 255, 240, 253}, + {223, 154, 177, 243, 182, 255, 240, 253}, + {224, 155, 178, 244, 183, 255, 241, 253}, {224, 155, 178, 244, 183, 255, 241, 253}, {225, 156, 178, 244, 184, 255, 241, 253}, + {225, 157, 178, 244, 184, 255, 241, 253}, {226, 158, 179, 244, 185, 255, 242, 253}, + {227, 158, 179, 244, 185, 255, 242, 253}, + {228, 159, 180, 245, 186, 255, 242, 253}, {228, 159, 180, 245, 186, 255, 242, 253}, {229, 160, 181, 245, 187, 255, 242, 253}, + {229, 160, 181, 245, 187, 255, 242, 253}, {230, 161, 182, 246, 188, 255, 243, 253}, + {230, 162, 182, 246, 188, 255, 243, 253}, + {231, 163, 183, 246, 189, 255, 243, 253}, {231, 163, 183, 246, 189, 255, 243, 253}, {232, 164, 184, 247, 190, 255, 243, 253}, + {232, 
164, 184, 247, 190, 255, 243, 253}, + {233, 165, 185, 247, 191, 255, 244, 253}, {233, 165, 185, 247, 191, 255, 244, 253}, {234, 166, 185, 247, 192, 255, 244, 253}, + {234, 167, 185, 247, 192, 255, 244, 253}, + {235, 168, 186, 248, 193, 255, 244, 253}, {235, 168, 186, 248, 193, 255, 244, 253}, {236, 169, 187, 248, 194, 255, 244, 253}, + {236, 169, 187, 248, 194, 255, 244, 253}, + {236, 170, 188, 248, 195, 255, 245, 253}, {236, 170, 188, 248, 195, 255, 245, 253}, {237, 171, 189, 249, 196, 255, 245, 254}, + {237, 172, 189, 249, 196, 255, 245, 254}, + {238, 173, 190, 249, 197, 255, 245, 254}, {238, 173, 190, 249, 197, 255, 245, 254}, {239, 174, 191, 249, 198, 255, 245, 254}, + {239, 174, 191, 249, 198, 255, 245, 254}, {240, 175, 192, 249, 199, 255, 246, 254}, + {240, 176, 192, 249, 199, 255, 246, 254}, + {240, 177, 193, 250, 200, 255, 246, 254}, {240, 177, 193, 250, 200, 255, 246, 254}, {241, 178, 194, 250, 201, 255, 246, 254}, + {241, 178, 194, 250, 201, 255, 246, 254}, {242, 179, 195, 250, 202, 255, 246, 254}, + {242, 180, 195, 250, 202, 255, 246, 254}, + {242, 181, 196, 250, 203, 255, 247, 254}, {242, 181, 196, 250, 203, 255, 247, 254}, {243, 182, 197, 251, 204, 255, 247, 254}, + {243, 183, 197, 251, 204, 255, 247, 254}, + {244, 184, 198, 251, 205, 255, 247, 254}, {244, 184, 198, 251, 205, 255, 247, 254}, {244, 185, 199, 251, 206, 255, 247, 254}, + {244, 185, 199, 251, 206, 255, 247, 254}, {245, 186, 200, 251, 207, 255, 247, 254}, + {245, 187, 200, 251, 207, 255, 247, 254}, + {246, 188, 201, 252, 207, 255, 248, 254}, {246, 188, 201, 252, 207, 255, 248, 254}, {246, 189, 202, 252, 208, 255, 248, 254}, + {246, 190, 202, 252, 208, 255, 248, 254}, + {247, 191, 203, 252, 209, 255, 248, 254}, {247, 191, 203, 252, 209, 255, 248, 254}, {247, 192, 204, 252, 210, 255, 248, 254}, + {247, 193, 204, 252, 210, 255, 248, 254}, + {248, 194, 205, 252, 211, 255, 248, 254}, {248, 194, 205, 252, 211, 255, 248, 254}, {248, 195, 206, 252, 212, 255, 249, 254}, + {248, 196, 206, 252, 212, 255, 249, 254}, + {249, 197, 207, 253, 213, 255, 249, 254}, {249, 197, 207, 253, 213, 255, 249, 254}, {249, 198, 208, 253, 214, 255, 249, 254}, + {249, 199, 209, 253, 214, 255, 249, 254}, + {250, 200, 210, 253, 215, 255, 249, 254}, {250, 200, 210, 253, 215, 255, 249, 254}, {250, 201, 211, 253, 215, 255, 249, 254}, + {250, 202, 211, 253, 215, 255, 249, 254}, + {250, 203, 212, 253, 216, 255, 249, 254}, {250, 203, 212, 253, 216, 255, 249, 254}, {251, 204, 213, 253, 217, 255, 250, 254}, + {251, 205, 213, 253, 217, 255, 250, 254}, {251, 206, 214, 254, 218, 255, 250, 254}, + {251, 206, 215, 254, 218, 255, 250, 254}, {252, 207, 216, 254, 219, 255, 250, 254}, + {252, 208, 216, 254, 219, 255, 250, 254}, {252, 209, 217, 254, 220, 255, 250, 254}, + {252, 210, 217, 254, 220, 255, 250, 254}, {252, 211, 218, 254, 221, 255, 250, 254}, + {252, 212, 218, 254, 221, 255, 250, 254}, {253, 213, 219, 254, 222, 255, 250, 254}, + {253, 213, 220, 254, 222, 255, 250, 254}, {253, 214, 221, 254, 223, 255, 250, 254}, + {253, 215, 221, 254, 223, 255, 250, 254}, {253, 216, 222, 254, 224, 255, 251, 254}, + {253, 217, 223, 254, 224, 255, 251, 254}, {253, 218, 224, 254, 225, 255, 251, 254}, + {253, 219, 224, 254, 225, 255, 251, 254}, {254, 220, 225, 254, 225, 255, 251, 254}, + {254, 221, 226, 254, 225, 255, 251, 254}, {254, 222, 227, 255, 226, 255, 251, 254}, + {254, 223, 227, 255, 226, 255, 251, 254}, {254, 224, 228, 255, 227, 255, 251, 254}, + {254, 225, 229, 255, 227, 255, 251, 254}, {254, 226, 230, 255, 228, 255, 251, 254}, + {254, 227, 230, 255, 229, 255, 
251, 254}, {255, 228, 231, 255, 230, 255, 251, 254}, + {255, 229, 232, 255, 230, 255, 251, 254}, {255, 230, 233, 255, 231, 255, 252, 254}, + {255, 231, 234, 255, 231, 255, 252, 254}, {255, 232, 235, 255, 232, 255, 252, 254}, + {255, 233, 236, 255, 232, 255, 252, 254}, {255, 235, 237, 255, 233, 255, 252, 254}, + {255, 236, 238, 255, 234, 255, 252, 254}, {255, 238, 240, 255, 235, 255, 252, 255}, + {255, 239, 241, 255, 235, 255, 252, 254}, {255, 241, 243, 255, 236, 255, 252, 254}, - {255, 246, 247, 255, 239, 255, 253, 255} + {255, 243, 245, 255, 237, 255, 252, 254}, + {255, 246, 247, 255, 239, 255, 253, 255}, + {255, 246, 247, 255, 239, 255, 253, 255}, }; -static void extend_model_to_full_distribution(vp9_prob p, - vp9_prob *tree_probs) { - const int l = (p - 1) / 2; - const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8; - if (p & 1) { - vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, - model[l], MODEL_NODES * sizeof(vp9_prob)); - } else { - // interpolate - int i; - for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) - tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] + - model[l + 1][i - UNCONSTRAINED_NODES]) >> 1; +static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 } + }, { // Band 1 + { 31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 }, + { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 } + }, { // Band 2 + { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 }, + { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 } + }, { // Band 3 + { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 }, + { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 } + }, { // Band 4 + { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 }, + { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 } + }, { // Band 5 + { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 }, + { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 } + } + }, { // Inter + { // Band 0 + { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 } + }, { // Band 1 + { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 }, + { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 } + }, { // Band 2 + { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 }, + { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 } + }, { // Band 3 + { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 }, + { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 } + }, { // Band 4 + { 114, 136, 232 }, { 51, 114, 207 }, { 11, 83, 155 }, + { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 } + }, { // Band 5 + { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 }, + { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 } + }, { // Band 1 + { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 }, + { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 } + }, { // Band 2 + { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 }, + { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 } + }, { // Band 3 + { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 }, + { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 } + }, { // Band 4 + { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 }, + { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 } + }, { // Band 5 + { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 }, + { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 } + } + }, { // Inter + { // Band 0 + { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 } + }, { // Band 1 + { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 }, + { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 } + }, { // Band 2 + { 83, 167, 237 }, { 
68, 145, 222 }, { 10, 103, 177 }, + { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 } + }, { // Band 3 + { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 }, + { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 } + }, { // Band 4 + { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 }, + { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 } + }, { // Band 5 + { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 }, + { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 } + } + } } +}; + +static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 } + }, { // Band 1 + { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 }, + { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 } + }, { // Band 2 + { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 }, + { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 } + }, { // Band 3 + { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 }, + { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 } + }, { // Band 4 + { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 }, + { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 } + }, { // Band 5 + { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 }, + { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 } + } + }, { // Inter + { // Band 0 + { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 } + }, { // Band 1 + { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 }, + { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 } + }, { // Band 2 + { 33, 179, 220 }, { 11, 140, 198 }, { 1, 89, 148 }, + { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 } + }, { // Band 3 + { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 } + }, { // Band 4 + { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 }, + { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 } + }, { // Band 5 + { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 }, + { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 } + }, { // Band 1 + { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 }, + { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 } + }, { // Band 2 + { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 }, + { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 } + }, { // Band 3 + { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 }, + { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 } + }, { // Band 4 + { 65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 }, + { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 } + }, { // Band 5 + { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 }, + { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 } + } + }, { // Inter + { // Band 0 + { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 } + }, { // Band 1 + { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 }, + { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 } + }, { // Band 2 + { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 }, + { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 } + }, { // Band 3 + { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 }, + { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 } + }, { // Band 4 + { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 }, + { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 } + }, { // Band 5 + { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 }, + { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 } + } + } + } +}; + +static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 } + }, { // Band 1 + { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 }, + 
{ 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 } + }, { // Band 2 + { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 }, + { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 } + }, { // Band 3 + { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 }, + { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 } + }, { // Band 4 + { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 }, + { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 } + }, { // Band 5 + { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 }, + { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 } + } + }, { // Inter + { // Band 0 + { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 } + }, { // Band 1 + { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 }, + { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 } + }, { // Band 2 + { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 }, + { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 } + }, { // Band 3 + { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 }, + { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 } + }, { // Band 4 + { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 }, + { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 } + }, { // Band 5 + { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 }, + { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 } + } + } + }, { // UV plane + { // Intra + { // Band 0 + { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 } + }, { // Band 1 + { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 }, + { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 } + }, { // Band 2 + { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 }, + { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 } + }, { // Band 3 + { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 }, + { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 } + }, { // Band 4 + { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 }, + { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 } + }, { // Band 5 + { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 }, + { 1, 60, 104 }, { 1, 36, 62 }, { 1, 16, 28 } + } + }, { // Inter + { // Band 0 + { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 } + }, { // Band 1 + { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 }, + { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 } + }, { // Band 2 + { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 }, + { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 } + }, { // Band 3 + { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 }, + { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 } + }, { // Band 4 + { 40, 194, 227 }, { 8, 147, 204 }, { 1, 94, 155 }, + { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 } + }, { // Band 5 + { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 }, + { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 } + } + } + } +}; + +static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = { + { // Y plane + { // Intra + { // Band 0 + { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 } + }, { // Band 1 + { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 }, + { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 } + }, { // Band 2 + { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 }, + { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 } + }, { // Band 3 + { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 }, + { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 } + }, { // Band 4 + { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 }, + { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 } + }, { // Band 5 + { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 }, + { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 } + } + }, { // Inter + { // Band 0 + { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 } + }, { // Band 1 + { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 }, + { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 } 
+      }, { // Band 2
+        { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
+        { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
+      }, { // Band 3
+        { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
+        { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
+      }, { // Band 4
+        { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
+        { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
+      }, { // Band 5
+        { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
+        { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
+      }
+    }
+  }, { // UV plane
+    { // Intra
+      { // Band 0
+        { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
+      }, { // Band 1
+        { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
+        { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
+      }, { // Band 2
+        { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
+        { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
+      }, { // Band 3
+        { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
+        { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
+      }, { // Band 4
+        { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
+        { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
+      }, { // Band 5
+        { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
+        { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
+      }
+    }, { // Inter
+      { // Band 0
+        { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
+      }, { // Band 1
+        { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+        { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
+      }, { // Band 2
+        { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
+        { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
+      }, { // Band 3
+        { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
+        { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
+      }, { // Band 4
+        { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
+        { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
+      }, { // Band 5
+        { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
+        { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
+      }
+    }
+  }
+};
+
+static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
+  vpx_memcpy(probs, vp9_pareto8_full[p == 0 ?
0 : p - 1], + MODEL_NODES * sizeof(vp9_prob)); } void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { if (full != model) vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES); - extend_model_to_full_distribution(model[PIVOT_NODE], full); + extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]); } -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; - -static void init_bit_tree(vp9_tree_index *p, int n) { - int i = 0; - - while (++i < n) { - p[0] = p[1] = i << 1; - p += 2; - } - - p[0] = p[1] = 0; -} - -static void init_bit_trees() { - init_bit_tree(cat1, 1); - init_bit_tree(cat2, 2); - init_bit_tree(cat3, 3); - init_bit_tree(cat4, 4); - init_bit_tree(cat5, 5); - init_bit_tree(cat6, 14); -} - -const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = { - { 0, 0, 0, 0}, - { 0, 0, 0, 1}, - { 0, 0, 0, 2}, - { 0, 0, 0, 3}, - { 0, 0, 0, 4}, - { cat1, Pcat1, 1, 5}, - { cat2, Pcat2, 2, 7}, - { cat3, Pcat3, 3, 11}, - { cat4, Pcat4, 4, 19}, - { cat5, Pcat5, 5, 35}, - { cat6, Pcat6, 14, 67}, - { 0, 0, 0, 0} -}; - -#include "vp9/common/vp9_default_coef_probs.h" - void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4); vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8); @@ -361,13 +734,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) { vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } -void vp9_coef_tree_initialize() { - init_bit_trees(); - vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); -} - -// #define COEF_COUNT_TESTING - #define COEF_COUNT_SAT 24 #define COEF_MAX_UPDATE_FACTOR 112 #define COEF_COUNT_SAT_KEY 24 @@ -379,29 +745,30 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, unsigned int count_sat, unsigned int update_factor) { const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - - vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size]; - const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size]; - vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; - unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = + vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size]; + const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *counts = cm->counts.coef[tx_size]; + unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = cm->counts.eob_branch[tx_size]; int i, j, k, l, m; - unsigned int branch_ct[UNCONSTRAINED_NODES][2]; - for (i = 0; i < BLOCK_TYPES; ++i) + for (i = 0; i < PLANE_TYPES; ++i) for (j = 0; j < REF_TYPES; ++j) for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; - vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct, - coef_counts[i][j][k][l]); - branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + const int n0 = counts[i][j][k][l][ZERO_TOKEN]; + const int n1 = counts[i][j][k][l][ONE_TOKEN]; + const int n2 = counts[i][j][k][l][TWO_TOKEN]; + const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN]; + const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = { + { neob, eob_counts[i][j][k][l] - neob }, + { n0, n1 + n2 }, + { n1, n2 } + }; for (m = 0; m < UNCONSTRAINED_NODES; ++m) - dst_coef_probs[i][j][k][l][m] = merge_probs( - pre_coef_probs[i][j][k][l][m], - branch_ct[m], - count_sat, update_factor); + probs[i][j][k][l][m] = 
merge_probs(pre_probs[i][j][k][l][m], + branch_ct[m], + count_sat, update_factor); } } diff --git a/media/libvpx/vp9/common/vp9_entropy.h b/media/libvpx/vp9/common/vp9_entropy.h index e133d6509262..8a10f2320475 100644 --- a/media/libvpx/vp9/common/vp9_entropy.h +++ b/media/libvpx/vp9/common/vp9_entropy.h @@ -16,40 +16,51 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_scan.h" -#include "vp9/common/vp9_treecoder.h" + +#ifdef __cplusplus +extern "C" { +#endif #define DIFF_UPDATE_PROB 252 -/* Coefficient token alphabet */ +// Coefficient token alphabet +#define ZERO_TOKEN 0 // 0 Extra Bits 0+0 +#define ONE_TOKEN 1 // 1 Extra Bits 0+1 +#define TWO_TOKEN 2 // 2 Extra Bits 0+1 +#define THREE_TOKEN 3 // 3 Extra Bits 0+1 +#define FOUR_TOKEN 4 // 4 Extra Bits 0+1 +#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1 +#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1 +#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1 +#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1 +#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1 +#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1 +#define EOB_TOKEN 11 // EOB Extra Bits 0+0 -#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ -#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ -#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ -#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ -#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ -#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ -#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ -#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ -#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ -#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ -#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 14+1 */ -#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ -#define MAX_ENTROPY_TOKENS 12 -#define ENTROPY_NODES 11 -#define EOSB_TOKEN 127 /* Not signalled, encoder only */ +#define ENTROPY_TOKENS 12 -#define INTER_MODE_CONTEXTS 7 +#define ENTROPY_NODES 11 -extern DECLARE_ALIGNED(16, const uint8_t, - vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]); -extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)]; +#define CAT1_MIN_VAL 5 +#define CAT2_MIN_VAL 7 +#define CAT3_MIN_VAL 11 +#define CAT4_MIN_VAL 19 +#define CAT5_MIN_VAL 35 +#define CAT6_MIN_VAL 67 -#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */ +// Extra bit probabilities. +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat1_prob[1]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat2_prob[2]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat3_prob[3]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat4_prob[4]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat5_prob[5]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_cat6_prob[14]); + +#define EOB_MODEL_TOKEN 3 extern const vp9_tree_index vp9_coefmodel_tree[]; -extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - typedef struct { const vp9_tree_index *tree; const vp9_prob *prob; @@ -58,15 +69,12 @@ typedef struct { } vp9_extra_bit; // indexed by token value -extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; +extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS]; -#define MAX_PROB 255 #define DCT_MAX_VALUE 16384 /* Coefficients are predicted via a 3-dimensional probability table. */ -/* Outside dimension. 0 = Y with DC, 1 = UV */ -#define BLOCK_TYPES 2 #define REF_TYPES 2 // intra=0, inter=1 /* Middle dimension reflects the coefficient position within the transform. 
*/ @@ -88,13 +96,14 @@ extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS]; coefficient band (and since zigzag positions 0, 1, and 2 are in distinct bands). */ -#define PREV_COEF_CONTEXTS 6 +#define COEFF_CONTEXTS 6 +#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS) // #define ENTROPY_STATS -typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] + [ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] [ENTROPY_NODES][2]; #define SUBEXP_PARAM 4 /* Subexponential code parameter */ @@ -102,8 +111,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] struct VP9Common; void vp9_default_coef_probs(struct VP9Common *cm); - -void vp9_coef_tree_initialize(); void vp9_adapt_coef_probs(struct VP9Common *cm); static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { @@ -123,10 +130,10 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 -extern const uint8_t vp9_coefband_trans_8x8plus[1024]; -extern const uint8_t vp9_coefband_trans_4x4[16]; +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_8x8plus[1024]); +DECLARE_ALIGNED(16, extern const uint8_t, vp9_coefband_trans_4x4[16]); -static const uint8_t *get_band_translate(TX_SIZE tx_size) { +static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 : vp9_coefband_trans_8x8plus; } @@ -135,24 +142,26 @@ static const uint8_t *get_band_translate(TX_SIZE tx_size) { // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly -#define COEFPROB_MODELS 128 +#define COEFF_PROB_MODELS 256 #define UNCONSTRAINED_NODES 3 #define PIVOT_NODE 2 // which node is pivot +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) +extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES]; + typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] - [UNCONSTRAINED_NODES]; + [COEFF_CONTEXTS][UNCONSTRAINED_NODES]; typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS] + [COEFF_CONTEXTS] [UNCONSTRAINED_NODES + 1]; void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); -static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, - const ENTROPY_CONTEXT *l) { +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { ENTROPY_CONTEXT above_ec = 0, left_ec = 0; switch (tx_size) { @@ -173,32 +182,27 @@ static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, left_ec = !!*(const uint64_t *)l; break; default: - assert(!"Invalid transform size."); + assert(0 && "Invalid transform size."); + break; } return combine_entropy_contexts(above_ec, left_ec); } -static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx, - const int16_t **scan, const int16_t **scan_nb) { - switch (tx_size) { - case TX_4X4: - get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb); - break; - case TX_8X8: - get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb); - break; - case TX_16X16: - get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb); - break; - 
case TX_32X32: - *scan = vp9_default_scan_32x32; - *scan_nb = vp9_default_scan_32x32_neighbors; - break; - default: - assert(!"Invalid transform size."); +static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { + const MODE_INFO *const mi = xd->mi[0]; + + if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) { + return &vp9_default_scan_orders[tx_size]; + } else { + const PREDICTION_MODE mode = get_y_mode(mi, block_idx); + return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]]; } } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/media/libvpx/vp9/common/vp9_entropymode.c b/media/libvpx/vp9/common/vp9_entropymode.c index 3b2510dcd476..5b00b0082a13 100644 --- a/media/libvpx/vp9/common/vp9_entropymode.c +++ b/media/libvpx/vp9/common/vp9_entropymode.c @@ -10,7 +10,6 @@ #include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_seg_common.h" @@ -232,21 +231,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = { -D63_PRED, 16, /* 7 = D63_NODE */ -D153_PRED, -D207_PRED /* 8 = D153_NODE */ }; -struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = { -INTER_OFFSET(ZEROMV), 2, -INTER_OFFSET(NEARESTMV), 4, -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV) }; -struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, 187, 225 @@ -306,7 +302,7 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; } -static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = { +static const vp9_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 }; @@ -318,17 +314,18 @@ static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] { 149, 144, }, }; -void vp9_init_mbmode_probs(VP9_COMMON *cm) { - vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs); - vp9_copy(cm->fc.y_mode_prob, default_if_y_probs); - vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob); - vp9_copy(cm->fc.partition_prob, default_partition_probs); - vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p); - vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p); - vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p); - vp9_copy(cm->fc.single_ref_prob, default_single_ref_p); - cm->fc.tx_probs = default_tx_probs; - vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs); +void vp9_init_mode_probs(FRAME_CONTEXT *fc) { + vp9_copy(fc->uv_mode_prob, default_if_uv_probs); + vp9_copy(fc->y_mode_prob, default_if_y_probs); + vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob); + vp9_copy(fc->partition_prob, default_partition_probs); + vp9_copy(fc->intra_inter_prob, default_intra_inter_p); + vp9_copy(fc->comp_inter_prob, default_comp_inter_p); + vp9_copy(fc->comp_ref_prob, default_comp_ref_p); + vp9_copy(fc->single_ref_prob, default_single_ref_p); + fc->tx_probs = default_tx_probs; + vp9_copy(fc->skip_probs, default_skip_probs); + vp9_copy(fc->inter_mode_probs, default_inter_mode_probs); } const vp9_tree_index vp9_switchable_interp_tree @@ -336,15 +333,6 @@ const vp9_tree_index 
vp9_switchable_interp_tree -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; -struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init() { - vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); - vp9_tokens_from_tree(vp9_switchable_interp_encodings, - vp9_switchable_interp_tree); - vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); - vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree); -} #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 @@ -356,7 +344,7 @@ static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, + vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR, probs); } @@ -396,7 +384,7 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i], counts->partition[i], fc->partition_prob[i]); - if (cm->mcomp_filter_type == SWITCHABLE) { + if (cm->interp_filter == SWITCHABLE) { for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i], counts->switchable_interp[i], fc->switchable_interp_prob[i]); @@ -426,9 +414,8 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } } - for (i = 0; i < MBSKIP_CONTEXTS; ++i) - fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i], - counts->mbskip[i]); + for (i = 0; i < SKIP_CONTEXTS; ++i) + fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]); } static void set_default_lf_deltas(struct loopfilter *lf) { @@ -464,27 +451,24 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); - vp9_init_mbmode_probs(cm); + vp9_init_mode_probs(&cm->fc); vp9_init_mv_probs(cm); - vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs); if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->reset_frame_context == 3) { // Reset all frame contexts. - for (i = 0; i < NUM_FRAME_CONTEXTS; ++i) + for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = cm->fc; } else if (cm->reset_frame_context == 2) { // Reset only the frame context specified in the frame header. 
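Both adapt_prob() above and its entropymv counterpart reduce to the same count-saturated blend via vp9_tree_merge_probs(). A sketch of that update with the helpers inlined; the real code goes through get_binary_prob()/merge_probs(), whose rounding may differ slightly, so treat this as illustrative:

static vp9_prob merge_prob_sketch(vp9_prob pre_prob,
                                  const unsigned int ct[2]) {
  const unsigned int den = ct[0] + ct[1];
  // probability of the 0-branch implied by this frame's counts
  unsigned int p = den ? (256 * ct[0] + den / 2) / den : 128;
  unsigned int count, factor;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  // evidence saturates at COUNT_SAT, capping factor at MAX_UPDATE_FACTOR
  count = den < COUNT_SAT ? den : COUNT_SAT;
  factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
  // blend the previous frame's probability toward the observed one
  return (vp9_prob)((pre_prob * (256 - factor) + p * factor + 128) >> 8);
}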
cm->frame_contexts[cm->frame_context_idx] = cm->fc; } - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); - vpx_memset(cm->mip, 0, - cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + if (frame_is_intra_only(cm)) + vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * + sizeof(*cm->prev_mip)); - vp9_update_mode_info_border(cm, cm->mip); - vp9_update_mode_info_border(cm, cm->prev_mip); + vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip)); vp9_zero(cm->ref_frame_sign_bias); diff --git a/media/libvpx/vp9/common/vp9_entropymode.h b/media/libvpx/vp9/common/vp9_entropymode.h index 38b419948dd1..533757bef031 100644 --- a/media/libvpx/vp9/common/vp9_entropymode.h +++ b/media/libvpx/vp9/common/vp9_entropymode.h @@ -12,14 +12,17 @@ #define VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymv.h" + +#ifdef __cplusplus +extern "C" { +#endif #define TX_SIZE_CONTEXTS 2 #define SWITCHABLE_FILTERS 3 // number of switchable filters #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) -// #define MODE_STATS - struct VP9Common; struct tx_probs { @@ -34,31 +37,56 @@ struct tx_counts { unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; +typedef struct frame_contexts { + vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; + vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; + vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES]; + vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS - 1]; + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; + vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; + vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; + vp9_prob single_ref_prob[REF_CONTEXTS][2]; + vp9_prob comp_ref_prob[REF_CONTEXTS]; + struct tx_probs tx_probs; + vp9_prob skip_probs[SKIP_CONTEXTS]; + nmv_context nmvc; +} FRAME_CONTEXT; + +typedef struct { + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; + unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; + vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES]; + unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES] + [COEF_BANDS][COEFF_CONTEXTS]; + unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; + unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref[REF_CONTEXTS][2][2]; + unsigned int comp_ref[REF_CONTEXTS][2]; + struct tx_counts tx; + unsigned int skip[SKIP_CONTEXTS][2]; + nmv_context_counts mv; +} FRAME_COUNTS; + extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES] [INTRA_MODES - 1]; - extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)]; -extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES]; - extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)]; -extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES]; - extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)]; -extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; - extern const 
vp9_tree_index vp9_switchable_interp_tree [TREE_SIZE(SWITCHABLE_FILTERS)]; -extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS]; - -void vp9_entropy_mode_init(); void vp9_setup_past_independence(struct VP9Common *cm); -void vp9_init_mbmode_probs(struct VP9Common *cm); +void vp9_init_mode_probs(FRAME_CONTEXT *fc); void vp9_adapt_mode_probs(struct VP9Common *cm); @@ -69,4 +97,17 @@ void tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p, void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, unsigned int (*ct_8x8p)[2]); +static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi, + const MODE_INFO *above_mi, + const MODE_INFO *left_mi, + int block) { + const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block); + const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block); + return vp9_kf_y_mode_prob[above][left]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/media/libvpx/vp9/common/vp9_entropymv.c b/media/libvpx/vp9/common/vp9_entropymv.c index 290dcdd17278..5bb048202b73 100644 --- a/media/libvpx/vp9/common/vp9_entropymv.c +++ b/media/libvpx/vp9/common/vp9_entropymv.c @@ -8,14 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_entropymv.h" #define MV_COUNT_SAT 20 #define MV_MAX_UPDATE_FACTOR 128 -/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ +// Integer pel reference mv threshold for use of high-precision 1/8 mv #define COMPANDED_MVREF_THRESH 8 const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { @@ -23,7 +22,6 @@ const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = { -MV_JOINT_HNZVZ, 4, -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ }; -struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_0, 2, @@ -37,48 +35,43 @@ const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1, }; -struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; -const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = { +const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; -struct vp9_token vp9_mv_fp_encodings[4]; static const nmv_context default_nmv_context = { {32, 64, 96}, - { // NOLINT - { /* vert component */ // NOLINT - 128, /* sign */ - {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ - {216}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + { + { // Vertical component + 128, // sign + {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, // class + {216}, // class0 + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, // bits + {{128, 128, 64}, {96, 112, 64}}, // class0_fp + {64, 96, 64}, // fp + 160, // class0_hp bit + 128, // hp }, - { /* hor component */ // NOLINT - 128, /* sign */ - {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ - {208}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + { // Horizontal component + 128, // sign 
+ {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, // class + {208}, // class0 + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, // bits + {{128, 128, 64}, {96, 112, 64}}, // class0_fp + {64, 96, 64}, // fp + 160, // class0_hp bit + 128, // hp } }, }; -#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) - static const uint8_t log_in_base_2[] = { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, @@ -125,13 +118,13 @@ static const uint8_t log_in_base_2[] = { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 }; -MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { - MV_CLASS_TYPE c = MV_CLASS_0; - if (z >= CLASS0_SIZE * 4096) - c = MV_CLASS_10; - else - c = log_in_base_2[z >> 3]; +static INLINE int mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) ? + MV_CLASS_10 : (MV_CLASS_TYPE)log_in_base_2[z >> 3]; if (offset) *offset = z - mv_class_base(c); return c; @@ -196,8 +189,8 @@ static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, const unsigned int *counts, vp9_prob *probs) { - tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, - probs); + vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, + MV_MAX_UPDATE_FACTOR, probs); } void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { @@ -235,13 +228,6 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { } } -void vp9_entropy_mv_init() { - vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree); - vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree); - vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree); - vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree); -} - void vp9_init_mv_probs(VP9_COMMON *cm) { cm->fc.nmvc = default_nmv_context; } diff --git a/media/libvpx/vp9/common/vp9_entropymv.h b/media/libvpx/vp9/common/vp9_entropymv.h index d843f5bfee16..e7033e437bd9 100644 --- a/media/libvpx/vp9/common/vp9_entropymv.h +++ b/media/libvpx/vp9/common/vp9_entropymv.h @@ -12,19 +12,21 @@ #ifndef VP9_COMMON_VP9_ENTROPYMV_H_ #define VP9_COMMON_VP9_ENTROPYMV_H_ -#include "vp9/common/vp9_treecoder.h" #include "./vpx_config.h" #include "vp9/common/vp9_blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + struct VP9Common; -void vp9_entropy_mv_init(); void vp9_init_mv_probs(struct VP9Common *cm); void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); int vp9_use_mv_hp(const MV *ref); -#define NMV_UPDATE_PROB 252 +#define MV_UPDATE_PROB 252 /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 @@ -62,6 +64,7 @@ typedef enum { #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_FP_SIZE 4 #define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) #define MV_MAX ((1 << MV_MAX_BITS) - 1) @@ -71,25 +74,18 @@ typedef enum { #define MV_UPP ((1 << MV_IN_USE_BITS) - 1) #define MV_LOW (-(1 << MV_IN_USE_BITS)) -extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)]; -extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; - -extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)]; -extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; - -extern const vp9_tree_index 
vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)]; -extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; - -extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)]; -extern struct vp9_token vp9_mv_fp_encodings[4]; +extern const vp9_tree_index vp9_mv_joint_tree[]; +extern const vp9_tree_index vp9_mv_class_tree[]; +extern const vp9_tree_index vp9_mv_class0_tree[]; +extern const vp9_tree_index vp9_mv_fp_tree[]; typedef struct { vp9_prob sign; vp9_prob classes[MV_CLASSES - 1]; vp9_prob class0[CLASS0_SIZE - 1]; vp9_prob bits[MV_OFFSET_BITS]; - vp9_prob class0_fp[CLASS0_SIZE][4 - 1]; - vp9_prob fp[4 - 1]; + vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1]; + vp9_prob fp[MV_FP_SIZE - 1]; vp9_prob class0_hp; vp9_prob hp; } nmv_component; @@ -116,8 +112,8 @@ typedef struct { unsigned int classes[MV_CLASSES]; unsigned int class0[CLASS0_SIZE]; unsigned int bits[MV_OFFSET_BITS][2]; - unsigned int class0_fp[CLASS0_SIZE][4]; - unsigned int fp[4]; + unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE]; + unsigned int fp[MV_FP_SIZE]; unsigned int class0_hp[2]; unsigned int hp[2]; } nmv_component_counts; @@ -129,4 +125,8 @@ typedef struct { void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/media/libvpx/vp9/common/vp9_enums.h b/media/libvpx/vp9/common/vp9_enums.h index 1651b905055d..8817fdbb9226 100644 --- a/media/libvpx/vp9/common/vp9_enums.h +++ b/media/libvpx/vp9/common/vp9_enums.h @@ -13,6 +13,10 @@ #include "./vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MI_SIZE_LOG2 3 #define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2) // 64 = 2^6 @@ -21,6 +25,20 @@ #define MI_MASK (MI_BLOCK_SIZE - 1) +// Bitstream profiles indicated by 2-3 bits in the uncompressed header. +// 00: Profile 0. 8-bit 4:2:0 only. +// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0. +// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling. +// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0 +// sampling. +// 111: Undefined profile. 
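The profile bits described above are read least-significant first, with a third bit present only when the first two are both set. One way to decode them, sketched against a vp9_rb_read_bit()-style bit reader; the helper name and placement are illustrative, not part of this patch:

static BITSTREAM_PROFILE read_profile_sketch(struct vp9_read_bit_buffer *rb) {
  int profile = vp9_rb_read_bit(rb);    // low bit: 00 -> 0, 10 -> 1
  profile |= vp9_rb_read_bit(rb) << 1;  // high bit: 01 -> 2, 11x -> 3 or more
  if (profile > 2)
    profile += vp9_rb_read_bit(rb);     // 110 -> PROFILE_3, 111 -> undefined
  return (BITSTREAM_PROFILE)profile;
}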
+typedef enum BITSTREAM_PROFILE { + PROFILE_0, + PROFILE_1, + PROFILE_2, + PROFILE_3, + MAX_PROFILES +} BITSTREAM_PROFILE; typedef enum BLOCK_SIZE { BLOCK_4X4, @@ -52,20 +70,22 @@ typedef enum PARTITION_TYPE { #define PARTITION_PLOFFSET 4 // number of probability models per block size #define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) +// block transform size typedef enum { - TX_4X4 = 0, // 4x4 dct transform - TX_8X8 = 1, // 8x8 dct transform - TX_16X16 = 2, // 16x16 dct transform - TX_32X32 = 3, // 32x32 dct transform + TX_4X4 = 0, // 4x4 transform + TX_8X8 = 1, // 8x8 transform + TX_16X16 = 2, // 16x16 transform + TX_32X32 = 3, // 32x32 transform TX_SIZES } TX_SIZE; +// frame transform mode typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - ALLOW_32X32 = 3, - TX_MODE_SELECT = 4, + ONLY_4X4 = 0, // only 4x4 transform used + ALLOW_8X8 = 1, // allow block transform size up to 8x8 + ALLOW_16X16 = 2, // allow block transform size up to 16x16 + ALLOW_32X32 = 3, // allow block transform size up to 32x32 + TX_MODE_SELECT = 4, // transform specified for each block TX_MODES = 5, } TX_MODE; @@ -73,7 +93,8 @@ typedef enum { DCT_DCT = 0, // DCT in both horizontal and vertical ADST_DCT = 1, // ADST in vertical, DCT in horizontal DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3 // ADST in both directions + ADST_ADST = 3, // ADST in both directions + TX_TYPES = 4 } TX_TYPE; typedef enum { @@ -87,4 +108,14 @@ typedef enum { SRGB = 7 // RGB } COLOR_SPACE; +typedef enum { + VP9_LAST_FLAG = 1 << 0, + VP9_GOLD_FLAG = 1 << 1, + VP9_ALT_FLAG = 1 << 2, +} VP9_REFFRAME; + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/media/libvpx/vp9/common/vp9_filter.c b/media/libvpx/vp9/common/vp9_filter.c index 79ace147cb84..afcdf22ec63e 100644 --- a/media/libvpx/vp9/common/vp9_filter.c +++ b/media/libvpx/vp9/common/vp9_filter.c @@ -10,12 +10,9 @@ #include <assert.h> -#include "vpx_ports/mem.h" - #include "vp9/common/vp9_filter.h" -DECLARE_ALIGNED(256, const subpel_kernel, - vp9_bilinear_filters[SUBPEL_SHIFTS]) = { +const InterpKernel vp9_bilinear_filters[SUBPEL_SHIFTS] = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -35,7 +32,7 @@ }; // Lagrangian interpolation filter -DECLARE_ALIGNED(256, const subpel_kernel, +DECLARE_ALIGNED(256, const InterpKernel, vp9_sub_pel_filters_8[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, { 0, 1, -5, 126, 8, -3, 1, 0}, @@ -56,7 +53,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // DCT based filter -DECLARE_ALIGNED(256, const subpel_kernel, +DECLARE_ALIGNED(256, const InterpKernel, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {-1, 3, -7, 127, 8, -3, 1, 0}, @@ -77,7 +74,7 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; // freqmultiplier = 0.5 -DECLARE_ALIGNED(256, const subpel_kernel, +DECLARE_ALIGNED(256, const InterpKernel, vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]) = { { 0, 0, 0, 128, 0, 0, 0, 0}, {-3, -1, 32, 64, 38, 1, -3, 0}, @@ -98,14 +95,15 @@ DECLARE_ALIGNED(256, const subpel_kernel, }; -static const subpel_kernel* vp9_filter_kernels[4] = { +static const InterpKernel* vp9_filter_kernels[4] = { vp9_sub_pel_filters_8, vp9_sub_pel_filters_8lp, vp9_sub_pel_filters_8s, vp9_bilinear_filters }; -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) { - return vp9_filter_kernels[type]; +const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) { + assert(filter != SWITCHABLE); + 
return vp9_filter_kernels[filter]; } diff --git a/media/libvpx/vp9/common/vp9_filter.h b/media/libvpx/vp9/common/vp9_filter.h index b1e7e6499e43..8c359c7175c8 100644 --- a/media/libvpx/vp9/common/vp9_filter.h +++ b/media/libvpx/vp9/common/vp9_filter.h @@ -13,6 +13,12 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + + +#ifdef __cplusplus +extern "C" { +#endif #define FILTER_BITS 7 @@ -27,25 +33,22 @@ typedef enum { EIGHTTAP_SHARP = 2, BILINEAR = 3, SWITCHABLE = 4 /* should be the last one */ -} INTERPOLATION_TYPE; +} INTERP_FILTER; -typedef int16_t subpel_kernel[SUBPEL_TAPS]; +typedef int16_t InterpKernel[SUBPEL_TAPS]; -struct subpix_fn_table { - const subpel_kernel *filter_x; - const subpel_kernel *filter_y; -}; +const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter); -const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type); - -extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS]; -extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS]; +DECLARE_ALIGNED(256, extern const InterpKernel, + vp9_bilinear_filters[SUBPEL_SHIFTS]); // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. #define BILINEAR_FILTERS_2TAP(x) \ (vp9_bilinear_filters[(x)] + SUBPEL_TAPS/2 - 1) +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/media/libvpx/vp9/common/vp9_findnearmv.c b/media/libvpx/vp9/common/vp9_findnearmv.c deleted file mode 100644 index b91c501435e2..000000000000 --- a/media/libvpx/vp9/common/vp9_findnearmv.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_mvref_common.h" - -static void lower_mv_precision(MV *mv, int allow_hp) { - const int use_hp = allow_hp && vp9_use_mv_hp(mv); - if (!use_hp) { - if (mv->row & 1) - mv->row += (mv->row > 0 ? -1 : 1); - if (mv->col & 1) - mv->col += (mv->col > 0 ? 
-1 : 1); - } -} - - -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, - int_mv *mvlist, int_mv *nearest, int_mv *near) { - int i; - // Make sure all the candidates are properly clamped etc - for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - lower_mv_precision(&mvlist[i].as_mv, allow_hp); - clamp_mv2(&mvlist[i].as_mv, xd); - } - *nearest = mvlist[0]; - *near = mvlist[1]; -} - -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, - int_mv *dst_nearest, - int_mv *dst_near, - int block_idx, int ref_idx, - int mi_row, int mi_col) { - int_mv dst_list[MAX_MV_REF_CANDIDATES]; - int_mv mv_list[MAX_MV_REF_CANDIDATES]; - MODE_INFO *const mi = xd->mi_8x8[0]; - - assert(ref_idx == 0 || ref_idx == 1); - assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier - - vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, - mi->mbmi.ref_frame[ref_idx], - mv_list, block_idx, mi_row, mi_col); - - dst_list[1].as_int = 0; - if (block_idx == 0) { - vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); - } else if (block_idx == 1 || block_idx == 2) { - int dst = 0, n; - b_mode_info *bmi = mi->bmi; - - dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; - for (n = 0; dst < MAX_MV_REF_CANDIDATES && - n < MAX_MV_REF_CANDIDATES; n++) - if (mv_list[n].as_int != dst_list[0].as_int) - dst_list[dst++].as_int = mv_list[n].as_int; - } else { - int dst = 0, n; - b_mode_info *bmi = mi->bmi; - - assert(block_idx == 3); - dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int; - if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int) - dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int; - if (dst < MAX_MV_REF_CANDIDATES && - dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int) - dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; - for (n = 0; dst < MAX_MV_REF_CANDIDATES && - n < MAX_MV_REF_CANDIDATES; n++) - if (mv_list[n].as_int != dst_list[0].as_int) - dst_list[dst++].as_int = mv_list[n].as_int; - } - - dst_nearest->as_int = dst_list[0].as_int; - dst_near->as_int = dst_list[1].as_int; -} diff --git a/media/libvpx/vp9/common/vp9_findnearmv.h b/media/libvpx/vp9/common/vp9_findnearmv.h deleted file mode 100644 index 2362caa417df..000000000000 --- a/media/libvpx/vp9/common/vp9_findnearmv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_VP9_FINDNEARMV_H_ -#define VP9_COMMON_VP9_FINDNEARMV_H_ - -#include "vp9/common/vp9_mv.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_treecoder.h" -#include "vp9/common/vp9_onyxc_int.h" - -#define LEFT_TOP_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) -#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) - -// check a list of motion vectors by sad score using a number rows of pixels -// above and a number cols of pixels in the left to select the one with best -// score to use as ref motion vector -void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, - int_mv *mvlist, int_mv *nearest, int_mv *near); - -// TODO(jingning): this mv clamping function should be block size dependent. 
-static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); -} - -void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, - int_mv *dst_nearest, - int_mv *dst_near, - int block_idx, int ref_idx, - int mi_row, int mi_col); - -static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *left_mi, int b) { - if (b == 0 || b == 2) { - if (!left_mi || is_inter_block(&left_mi->mbmi)) - return DC_PRED; - - return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode - : left_mi->mbmi.mode; - } else { - assert(b == 1 || b == 3); - return cur_mi->bmi[b - 1].as_mode; - } -} - -static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi, - const MODE_INFO *above_mi, int b) { - if (b == 0 || b == 1) { - if (!above_mi || is_inter_block(&above_mi->mbmi)) - return DC_PRED; - - return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode - : above_mi->mbmi.mode; - } else { - assert(b == 2 || b == 3); - return cur_mi->bmi[b - 2].as_mode; - } -} - -#endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/media/libvpx/vp9/common/vp9_frame_buffers.c b/media/libvpx/vp9/common/vp9_frame_buffers.c new file mode 100644 index 000000000000..733b3a927233 --- /dev/null +++ b/media/libvpx/vp9/common/vp9_frame_buffers.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vp9/common/vp9_frame_buffers.h" +#include "vpx_mem/vpx_mem.h" + +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) { + assert(list != NULL); + vp9_free_internal_frame_buffers(list); + + list->num_internal_frame_buffers = + VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + list->int_fb = + (InternalFrameBuffer *)vpx_calloc(list->num_internal_frame_buffers, + sizeof(*list->int_fb)); + return (list->int_fb == NULL); +} + +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) { + int i; + + assert(list != NULL); + + for (i = 0; i < list->num_internal_frame_buffers; ++i) { + vpx_free(list->int_fb[i].data); + list->int_fb[i].data = NULL; + } + vpx_free(list->int_fb); + list->int_fb = NULL; +} + +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb) { + int i; + InternalFrameBufferList *const int_fb_list = + (InternalFrameBufferList *)cb_priv; + if (int_fb_list == NULL) + return -1; + + // Find a free frame buffer.
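  // (A free slot is reused and only grown when it is smaller than min_size --
  // see the vpx_realloc below -- so steady-state decoding performs no
  // per-frame allocation.)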
+ for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) { + if (!int_fb_list->int_fb[i].in_use) + break; + } + + if (i == int_fb_list->num_internal_frame_buffers) + return -1; + + if (int_fb_list->int_fb[i].size < min_size) { + int_fb_list->int_fb[i].data = + (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); + if (!int_fb_list->int_fb[i].data) + return -1; + + int_fb_list->int_fb[i].size = min_size; + } + + fb->data = int_fb_list->int_fb[i].data; + fb->size = int_fb_list->int_fb[i].size; + int_fb_list->int_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the internal frame buffer. + fb->priv = &int_fb_list->int_fb[i]; + return 0; +} + +int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb) { + InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv; + (void)cb_priv; + if (int_fb) + int_fb->in_use = 0; + return 0; +} diff --git a/media/libvpx/vp9/common/vp9_frame_buffers.h b/media/libvpx/vp9/common/vp9_frame_buffers.h new file mode 100644 index 000000000000..e2cfe61b6621 --- /dev/null +++ b/media/libvpx/vp9/common/vp9_frame_buffers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ + +#include "vpx/vpx_frame_buffer.h" +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct InternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +} InternalFrameBuffer; + +typedef struct InternalFrameBufferList { + int num_internal_frame_buffers; + InternalFrameBuffer *int_fb; +} InternalFrameBufferList; + +// Initializes |list|. Returns 0 on success. +int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list); + +// Free any data allocated to the frame buffers. +void vp9_free_internal_frame_buffers(InternalFrameBufferList *list); + +// Callback used by libvpx to request an external frame buffer. |cb_priv| +// Callback private data, which points to an InternalFrameBufferList. +// |min_size| is the minimum size in bytes needed to decode the next frame. +// |fb| pointer to the frame buffer. +int vp9_get_frame_buffer(void *cb_priv, size_t min_size, + vpx_codec_frame_buffer_t *fb); + +// Callback used by libvpx when there are no references to the frame buffer. +// |cb_priv| is not used. |fb| pointer to the frame buffer. 
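The get/release pair above is designed to be handed to the decoder as external frame-buffer callbacks. A minimal wiring sketch, assuming the vpx_codec_set_frame_buffer_functions() entry point from vpx/vpx_decoder.h that accompanies vpx/vpx_frame_buffer.h; error handling is elided:

#include "vpx/vpx_decoder.h"
#include "vp9/common/vp9_frame_buffers.h"

static int attach_internal_buffers(vpx_codec_ctx_t *decoder,
                                   InternalFrameBufferList *list) {
  if (vp9_alloc_internal_frame_buffers(list))
    return -1;  // could not allocate the buffer list itself
  // From here on the decoder calls vp9_get_frame_buffer()/
  // vp9_release_frame_buffer() with |list| as cb_priv.
  return vpx_codec_set_frame_buffer_functions(decoder, vp9_get_frame_buffer,
                                              vp9_release_frame_buffer,
                                              list) != VPX_CODEC_OK;
}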
+int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/media/libvpx/vp9/common/vp9_idct.c b/media/libvpx/vp9/common/vp9_idct.c index 149362a4a636..856d41e7001e 100644 --- a/media/libvpx/vp9/common/vp9_idct.c +++ b/media/libvpx/vp9/common/vp9_idct.c @@ -96,7 +96,7 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { } } -static void idct4_1d(const int16_t *input, int16_t *output) { +static void idct4(const int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; // stage 1 @@ -124,7 +124,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { // Rows for (i = 0; i < 4; ++i) { - idct4_1d(input, outptr); + idct4(input, outptr); input += 4; outptr += 4; } @@ -133,7 +133,7 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 4; ++i) { for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - idct4_1d(temp_in, temp_out); + idct4(temp_in, temp_out); for (j = 0; j < 4; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]); @@ -156,7 +156,7 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { } } -static void idct8_1d(const int16_t *input, int16_t *output) { +static void idct8(const int16_t *input, int16_t *output) { int16_t step1[8], step2[8]; int temp1, temp2; // stage 1 @@ -174,7 +174,7 @@ static void idct8_1d(const int16_t *input, int16_t *output) { step1[6] = dct_const_round_shift(temp2); // stage 2 & stage 3 - even half - idct4_1d(step1, step1); + idct4(step1, step1); // stage 2 - odd half step2[4] = step1[4] + step1[5]; @@ -209,7 +209,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows for (i = 0; i < 8; ++i) { - idct8_1d(input, outptr); + idct8(input, outptr); input += 8; outptr += 8; } @@ -218,7 +218,7 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_1d(temp_in, temp_out); + idct8(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]); @@ -238,7 +238,7 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void iadst4_1d(const int16_t *input, int16_t *output) { +static void iadst4(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[0]; @@ -283,10 +283,10 @@ static void iadst4_1d(const int16_t *input, int16_t *output) { void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, int tx_type) { const transform_2d IHT_4[] = { - { idct4_1d, idct4_1d }, // DCT_DCT = 0 - { iadst4_1d, idct4_1d }, // ADST_DCT = 1 - { idct4_1d, iadst4_1d }, // DCT_ADST = 2 - { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + { idct4, idct4 }, // DCT_DCT = 0 + { iadst4, idct4 }, // ADST_DCT = 1 + { idct4, iadst4 }, // DCT_ADST = 2 + { iadst4, iadst4 } // ADST_ADST = 3 }; int i, j; @@ -311,7 +311,7 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, + dest[j * stride + i]); } } -static void iadst8_1d(const int16_t *input, int16_t *output) { +static void iadst8(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; int x0 = input[7]; @@ -389,10 +389,10 @@ static void iadst8_1d(const int16_t *input, int16_t *output) { } static const 
transform_2d IHT_8[] = { - { idct8_1d, idct8_1d }, // DCT_DCT = 0 - { iadst8_1d, idct8_1d }, // ADST_DCT = 1 - { idct8_1d, iadst8_1d }, // DCT_ADST = 2 - { iadst8_1d, iadst8_1d } // ADST_ADST = 3 + { idct8, idct8 }, // DCT_DCT = 0 + { iadst8, idct8 }, // ADST_DCT = 1 + { idct8, iadst8 }, // DCT_ADST = 2 + { iadst8, iadst8 } // ADST_ADST = 3 }; void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, @@ -421,7 +421,7 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, } } -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) { int16_t out[8 * 8] = { 0 }; int16_t *outptr = out; int i, j; @@ -430,7 +430,7 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows // only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { - idct8_1d(input, outptr); + idct8(input, outptr); input += 8; outptr += 8; } @@ -439,14 +439,14 @@ void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_1d(temp_in, temp_out); + idct8(temp_in, temp_out); for (j = 0; j < 8; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]); } } -static void idct16_1d(const int16_t *input, int16_t *output) { +static void idct16(const int16_t *input, int16_t *output) { int16_t step1[16], step2[16]; int temp1, temp2; @@ -619,7 +619,7 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows for (i = 0; i < 16; ++i) { - idct16_1d(input, outptr); + idct16(input, outptr); input += 16; outptr += 16; } @@ -628,14 +628,14 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - idct16_1d(temp_in, temp_out); + idct16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); } } -static void iadst16_1d(const int16_t *input, int16_t *output) { +static void iadst16(const int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; int x0 = input[15]; @@ -807,10 +807,10 @@ static void iadst16_1d(const int16_t *input, int16_t *output) { } static const transform_2d IHT_16[] = { - { idct16_1d, idct16_1d }, // DCT_DCT = 0 - { iadst16_1d, idct16_1d }, // ADST_DCT = 1 - { idct16_1d, iadst16_1d }, // DCT_ADST = 2 - { iadst16_1d, iadst16_1d } // ADST_ADST = 3 + { idct16, idct16 }, // DCT_DCT = 0 + { iadst16, idct16 }, // ADST_DCT = 1 + { idct16, iadst16 }, // DCT_ADST = 2 + { iadst16, iadst16 } // ADST_ADST = 3 }; void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, @@ -848,7 +848,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
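  // (With eob <= 10 the scan has visited nothing outside the top-left 4x4, so
  // rows 4..15 would transform all-zero input; skipping them is what makes
  // this variant cheaper than the full 256-coefficient path.)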
for (i = 0; i < 4; ++i) { - idct16_1d(input, outptr); + idct16(input, outptr); input += 16; outptr += 16; } @@ -857,7 +857,7 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j*16 + i]; - idct16_1d(temp_in, temp_out); + idct16(temp_in, temp_out); for (j = 0; j < 16; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -877,7 +877,7 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { } } -static void idct32_1d(const int16_t *input, int16_t *output) { +static void idct32(const int16_t *input, int16_t *output) { int16_t step1[32], step2[32]; int temp1, temp2; @@ -1263,7 +1263,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; if (zero_coeff[0] | zero_coeff[1]) - idct32_1d(input, outptr); + idct32(input, outptr); else vpx_memset(outptr, 0, sizeof(int16_t) * 32); input += 32; @@ -1274,7 +1274,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); + idct32(temp_in, temp_out); for (j = 0; j < 32; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -1290,7 +1290,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { // Rows // only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { - idct32_1d(input, outptr); + idct32(input, outptr); input += 32; outptr += 32; } @@ -1299,7 +1299,7 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); + idct32(temp_in, temp_out); for (j = 0; j < 32; ++j) dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * stride + i]); @@ -1345,43 +1345,37 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { // coefficients. Use eobs to decide what to do. // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. // Combine that with code here. - if (eob) { - if (eob == 1) - // DC only DCT coefficient - vp9_idct8x8_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct8x8_10_add(input, dest, stride); - else - vp9_idct8x8_64_add(input, dest, stride); - } + if (eob == 1) + // DC only DCT coefficient + vp9_idct8x8_1_add(input, dest, stride); + else if (eob <= 12) + vp9_idct8x8_12_add(input, dest, stride); + else + vp9_idct8x8_64_add(input, dest, stride); } void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int eob) { /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ - if (eob) { - if (eob == 1) - /* DC only DCT coefficient. */ - vp9_idct16x16_1_add(input, dest, stride); - else if (eob <= 10) - vp9_idct16x16_10_add(input, dest, stride); - else - vp9_idct16x16_256_add(input, dest, stride); - } + if (eob == 1) + /* DC only DCT coefficient. 
*/ + vp9_idct16x16_1_add(input, dest, stride); + else if (eob <= 10) + vp9_idct16x16_10_add(input, dest, stride); + else + vp9_idct16x16_256_add(input, dest, stride); } void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, int eob) { - if (eob) { - if (eob == 1) - vp9_idct32x32_1_add(input, dest, stride); - else if (eob <= 34) - // non-zero coeff only in upper-left 8x8 - vp9_idct32x32_34_add(input, dest, stride); - else - vp9_idct32x32_1024_add(input, dest, stride); - } + if (eob == 1) + vp9_idct32x32_1_add(input, dest, stride); + else if (eob <= 34) + // non-zero coeff only in upper-left 8x8 + vp9_idct32x32_34_add(input, dest, stride); + else + vp9_idct32x32_1024_add(input, dest, stride); } // iht @@ -1398,9 +1392,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, if (tx_type == DCT_DCT) { vp9_idct8x8_add(input, dest, stride, eob); } else { - if (eob > 0) { - vp9_iht8x8_64_add(input, dest, stride, tx_type); - } + vp9_iht8x8_64_add(input, dest, stride, tx_type); } } @@ -1409,8 +1401,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, if (tx_type == DCT_DCT) { vp9_idct16x16_add(input, dest, stride, eob); } else { - if (eob > 0) { - vp9_iht16x16_256_add(input, dest, stride, tx_type); - } + vp9_iht16x16_256_add(input, dest, stride, tx_type); } } diff --git a/media/libvpx/vp9/common/vp9_idct.h b/media/libvpx/vp9/common/vp9_idct.h index 183c50abfe08..7f595e1cca20 100644 --- a/media/libvpx/vp9/common/vp9_idct.h +++ b/media/libvpx/vp9/common/vp9_idct.h @@ -18,6 +18,10 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_enums.h" +#ifdef __cplusplus +extern "C" { +#endif + // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 @@ -29,8 +33,8 @@ #define pair_set_epi16(a, b) \ _mm_set_epi16(b, a, b, a, b, a, b, a) -#define pair_set_epi32(a, b) \ - _mm_set_epi32(b, a, b, a) +#define dual_set_epi16(a, b) \ + _mm_set_epi16(b, b, b, b, a, a, a, a) // Constants: // for (int i = 1; i< 32; ++i) @@ -77,6 +81,16 @@ static const int sinpi_4_9 = 15212; static INLINE int dct_const_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid VP9 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt VP9 streams. However, strictly checking + // this range for every intermediate coefficient can burdensome for a decoder, + // therefore the following assertion is only enabled when configured with + // --enable-coefficient-range-checking. + assert(INT16_MIN <= rv); + assert(rv <= INT16_MAX); +#endif return (int16_t)rv; } @@ -103,4 +117,8 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, int stride, int eob); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/media/libvpx/vp9/common/vp9_loopfilter.c b/media/libvpx/vp9/common/vp9_loopfilter.c index a804d3b45089..3b39d4274b10 100644 --- a/media/libvpx/vp9/common/vp9_loopfilter.c +++ b/media/libvpx/vp9/common/vp9_loopfilter.c @@ -16,27 +16,7 @@ #include "vp9/common/vp9_seg_common.h" -// This structure holds bit masks for all 8x8 blocks in a 64x64 region. -// Each 1 bit represents a position in which we want to apply the loop filter. -// Left_ entries refer to whether we apply a filter on the border to the -// left of the block. 
Above_ entries refer to whether or not to apply a -// filter on the above border. Int_ entries refer to whether or not to -// apply borders on the 4x4 edges within the 8x8 block that each bit -// represents. -// Since each transform is accompanied by a potentially different type of -// loop filter there is a different entry in the array for each transform size. -typedef struct { - uint64_t left_y[TX_SIZES]; - uint64_t above_y[TX_SIZES]; - uint64_t int_4x4_y; - uint16_t left_uv[TX_SIZES]; - uint16_t above_uv[TX_SIZES]; - uint16_t int_4x4_uv; - uint8_t lfl_y[64]; - uint8_t lfl_uv[16]; -} LOOP_FILTER_MASK; - -// 64 bit masks for left transform size. Each 1 represents a position where +// 64 bit masks for left transform size. Each 1 represents a position where // we should apply a loop filter across the left border of an 8x8 block // boundary. // @@ -54,13 +34,13 @@ typedef struct { // // A loopfilter should be applied to every other 8x8 horizontally. static const uint64_t left_64x64_txform_mask[TX_SIZES]= { - 0xffffffffffffffff, // TX_4X4 - 0xffffffffffffffff, // TX_8x8 - 0x5555555555555555, // TX_16x16 - 0x1111111111111111, // TX_32x32 + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x5555555555555555, // TX_16x16 + 0x1111111111111111, // TX_32x32 }; -// 64 bit masks for above transform size. Each 1 represents a position where +// 64 bit masks for above transform size. Each 1 represents a position where // we should apply a loop filter across the top border of an 8x8 block // boundary. // @@ -78,15 +58,15 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= { // // A loopfilter should be applied to every other 4 the row vertically. static const uint64_t above_64x64_txform_mask[TX_SIZES]= { - 0xffffffffffffffff, // TX_4X4 - 0xffffffffffffffff, // TX_8x8 - 0x00ff00ff00ff00ff, // TX_16x16 - 0x000000ff000000ff, // TX_32x32 + 0xffffffffffffffff, // TX_4X4 + 0xffffffffffffffff, // TX_8x8 + 0x00ff00ff00ff00ff, // TX_16x16 + 0x000000ff000000ff, // TX_32x32 }; -// 64 bit masks for prediction sizes (left). Each 1 represents a position -// where left border of an 8x8 block. These are aligned to the right most -// appropriate bit, and then shifted into place. +// 64 bit masks for prediction sizes (left). Each 1 represents a position +// where left border of an 8x8 block. These are aligned to the right most +// appropriate bit, and then shifted into place. 
// // In the case of TX_16x32 -> ( low order byte first ) we end up with // a mask that looks like this : @@ -100,54 +80,54 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= { // 00000000 // 00000000 static const uint64_t left_prediction_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4, - 0x0000000000000001, // BLOCK_4X8, - 0x0000000000000001, // BLOCK_8X4, - 0x0000000000000001, // BLOCK_8X8, - 0x0000000000000101, // BLOCK_8X16, - 0x0000000000000001, // BLOCK_16X8, - 0x0000000000000101, // BLOCK_16X16, - 0x0000000001010101, // BLOCK_16X32, - 0x0000000000000101, // BLOCK_32X16, - 0x0000000001010101, // BLOCK_32X32, - 0x0101010101010101, // BLOCK_32X64, - 0x0000000001010101, // BLOCK_64X32, - 0x0101010101010101, // BLOCK_64X64 + 0x0000000000000001, // BLOCK_4X4, + 0x0000000000000001, // BLOCK_4X8, + 0x0000000000000001, // BLOCK_8X4, + 0x0000000000000001, // BLOCK_8X8, + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000001, // BLOCK_16X8, + 0x0000000000000101, // BLOCK_16X16, + 0x0000000001010101, // BLOCK_16X32, + 0x0000000000000101, // BLOCK_32X16, + 0x0000000001010101, // BLOCK_32X32, + 0x0101010101010101, // BLOCK_32X64, + 0x0000000001010101, // BLOCK_64X32, + 0x0101010101010101, // BLOCK_64X64 }; // 64 bit mask to shift and set for each prediction size. static const uint64_t above_prediction_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4 - 0x0000000000000001, // BLOCK_4X8 - 0x0000000000000001, // BLOCK_8X4 - 0x0000000000000001, // BLOCK_8X8 - 0x0000000000000001, // BLOCK_8X16, - 0x0000000000000003, // BLOCK_16X8 - 0x0000000000000003, // BLOCK_16X16 - 0x0000000000000003, // BLOCK_16X32, - 0x000000000000000f, // BLOCK_32X16, - 0x000000000000000f, // BLOCK_32X32, - 0x000000000000000f, // BLOCK_32X64, - 0x00000000000000ff, // BLOCK_64X32, - 0x00000000000000ff, // BLOCK_64X64 + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000001, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000003, // BLOCK_16X16 + 0x0000000000000003, // BLOCK_16X32, + 0x000000000000000f, // BLOCK_32X16, + 0x000000000000000f, // BLOCK_32X32, + 0x000000000000000f, // BLOCK_32X64, + 0x00000000000000ff, // BLOCK_64X32, + 0x00000000000000ff, // BLOCK_64X64 }; -// 64 bit mask to shift and set for each prediction size. A bit is set for +// 64 bit mask to shift and set for each prediction size. A bit is set for // each 8x8 block that would be in the left most block of the given block // size in the 64x64 block. 
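All of these tables are right-aligned bit patterns meant to be shifted to a block's position, per the comment above. A sketch of the placement arithmetic with a hypothetical helper; the 64-bit luma masks index the 8x8 grid of 8x8 units inside a 64x64 superblock, and the 16-bit *_uv tables below do the same on a 4x4 chroma grid with a row stride of 4:

static uint64_t place_left_mask_sketch(BLOCK_SIZE bs, int row8, int col8) {
  // bit (row8 * 8 + col8) marks the block's top-left 8x8 unit; the pattern
  // from the table lands on every 8x8 unit along the block's left border
  return left_prediction_mask[bs] << (row8 * 8 + col8);
}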
static const uint64_t size_mask[BLOCK_SIZES] = { - 0x0000000000000001, // BLOCK_4X4 - 0x0000000000000001, // BLOCK_4X8 - 0x0000000000000001, // BLOCK_8X4 - 0x0000000000000001, // BLOCK_8X8 - 0x0000000000000101, // BLOCK_8X16, - 0x0000000000000003, // BLOCK_16X8 - 0x0000000000000303, // BLOCK_16X16 - 0x0000000003030303, // BLOCK_16X32, - 0x0000000000000f0f, // BLOCK_32X16, - 0x000000000f0f0f0f, // BLOCK_32X32, - 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, - 0x00000000ffffffff, // BLOCK_64X32, - 0xffffffffffffffff, // BLOCK_64X64 + 0x0000000000000001, // BLOCK_4X4 + 0x0000000000000001, // BLOCK_4X8 + 0x0000000000000001, // BLOCK_8X4 + 0x0000000000000001, // BLOCK_8X8 + 0x0000000000000101, // BLOCK_8X16, + 0x0000000000000003, // BLOCK_16X8 + 0x0000000000000303, // BLOCK_16X16 + 0x0000000003030303, // BLOCK_16X32, + 0x0000000000000f0f, // BLOCK_32X16, + 0x000000000f0f0f0f, // BLOCK_32X32, + 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64, + 0x00000000ffffffff, // BLOCK_64X32, + 0xffffffffffffffff, // BLOCK_64X64 }; // These are used for masking the left and above borders. @@ -156,95 +136,82 @@ static const uint64_t above_border = 0x000000ff000000ff; // 16 bit masks for uv transform sizes. static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= { - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x5555, // TX_16x16 - 0x1111, // TX_32x32 + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x5555, // TX_16x16 + 0x1111, // TX_32x32 }; static const uint16_t above_64x64_txform_mask_uv[TX_SIZES]= { - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x0f0f, // TX_16x16 - 0x000f, // TX_32x32 + 0xffff, // TX_4X4 + 0xffff, // TX_8x8 + 0x0f0f, // TX_16x16 + 0x000f, // TX_32x32 }; // 16 bit left mask to shift and set for each uv prediction size. static const uint16_t left_prediction_mask_uv[BLOCK_SIZES] = { - 0x0001, // BLOCK_4X4, - 0x0001, // BLOCK_4X8, - 0x0001, // BLOCK_8X4, - 0x0001, // BLOCK_8X8, - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8, - 0x0001, // BLOCK_16X16, - 0x0011, // BLOCK_16X32, - 0x0001, // BLOCK_32X16, - 0x0011, // BLOCK_32X32, - 0x1111, // BLOCK_32X64 - 0x0011, // BLOCK_64X32, - 0x1111, // BLOCK_64X64 + 0x0001, // BLOCK_4X4, + 0x0001, // BLOCK_4X8, + 0x0001, // BLOCK_8X4, + 0x0001, // BLOCK_8X8, + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8, + 0x0001, // BLOCK_16X16, + 0x0011, // BLOCK_16X32, + 0x0001, // BLOCK_32X16, + 0x0011, // BLOCK_32X32, + 0x1111, // BLOCK_32X64 + 0x0011, // BLOCK_64X32, + 0x1111, // BLOCK_64X64 }; // 16 bit above mask to shift and set for uv each prediction size. 
static const uint16_t above_prediction_mask_uv[BLOCK_SIZES] = { - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0001, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0003, // BLOCK_32X32, - 0x0003, // BLOCK_32X64, - 0x000f, // BLOCK_64X32, - 0x000f, // BLOCK_64X64 + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0001, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0003, // BLOCK_32X32, + 0x0003, // BLOCK_32X64, + 0x000f, // BLOCK_64X32, + 0x000f, // BLOCK_64X64 }; // 64 bit mask to shift and set for each uv prediction size static const uint16_t size_mask_uv[BLOCK_SIZES] = { - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0011, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0033, // BLOCK_32X32, - 0x3333, // BLOCK_32X64, - 0x00ff, // BLOCK_64X32, - 0xffff, // BLOCK_64X64 + 0x0001, // BLOCK_4X4 + 0x0001, // BLOCK_4X8 + 0x0001, // BLOCK_8X4 + 0x0001, // BLOCK_8X8 + 0x0001, // BLOCK_8X16, + 0x0001, // BLOCK_16X8 + 0x0001, // BLOCK_16X16 + 0x0011, // BLOCK_16X32, + 0x0003, // BLOCK_32X16, + 0x0033, // BLOCK_32X32, + 0x3333, // BLOCK_32X64, + 0x00ff, // BLOCK_64X32, + 0xffff, // BLOCK_64X64 }; static const uint16_t left_border_uv = 0x1111; static const uint16_t above_border_uv = 0x000f; - -static void lf_init_lut(loop_filter_info_n *lfi) { - lfi->mode_lf_lut[DC_PRED] = 0; - lfi->mode_lf_lut[D45_PRED] = 0; - lfi->mode_lf_lut[D135_PRED] = 0; - lfi->mode_lf_lut[D117_PRED] = 0; - lfi->mode_lf_lut[D153_PRED] = 0; - lfi->mode_lf_lut[D207_PRED] = 0; - lfi->mode_lf_lut[D63_PRED] = 0; - lfi->mode_lf_lut[V_PRED] = 0; - lfi->mode_lf_lut[H_PRED] = 0; - lfi->mode_lf_lut[TM_PRED] = 0; - lfi->mode_lf_lut[ZEROMV] = 0; - lfi->mode_lf_lut[NEARESTMV] = 1; - lfi->mode_lf_lut[NEARMV] = 1; - lfi->mode_lf_lut[NEWMV] = 1; -} +static const int mode_lf_lut[MB_MODE_COUNT] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) +}; static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { int lvl; // For each possible value for the loop filter fill out limits for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { - // Set loop filter paramaeters that control sharpness. + // Set loop filter parameters that control sharpness. 
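    // (Worked example, assuming the standard 9 - sharpness_lvl cap in the
    // elided lines that follow: with lvl == 32, sharpness 0 keeps the full
    // limit of 32, while sharpness 5 shifts by 2 to get 8 and the cap then
    // tightens it to 4 -- sharper settings filter less aggressively.)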
int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); if (sharpness_lvl > 0) { @@ -261,6 +228,12 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { } } +static uint8_t get_filter_level(const loop_filter_info_n *lfi_n, + const MB_MODE_INFO *mbmi) { + return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; +} + void vp9_loop_filter_init(VP9_COMMON *cm) { loop_filter_info_n *lfi = &cm->lf_info; struct loopfilter *lf = &cm->lf; @@ -270,9 +243,6 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { update_sharpness(lfi, lf->sharpness_level); lf->last_sharpness_level = lf->sharpness_level; - // init LUT for lvl and hev thr picking - lf_init_lut(lfi); - // init hev threshold const vectors for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); @@ -280,7 +250,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) { void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int seg_id; - // n_shift is the a multiplier for lf_deltas + // n_shift is the multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 const int scale = 1 << (default_filt_lvl >> 5); @@ -298,9 +268,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { int lvl_seg = default_filt_lvl; if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) { const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); - lvl_seg = seg->abs_delta == SEGMENT_ABSDATA - ? data - : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); + lvl_seg = clamp(seg->abs_delta == SEGMENT_ABSDATA ? + data : default_filt_lvl + data, + 0, MAX_LOOP_FILTER); } if (!lf->mode_ref_delta_enabled) { @@ -323,40 +293,102 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) { } } -static void filter_selectively_vert(uint8_t *s, int pitch, - unsigned int mask_16x16, - unsigned int mask_8x8, - unsigned int mask_4x4, - unsigned int mask_4x4_int, - const loop_filter_info_n *lfi_n, - const uint8_t *lfl) { +static void filter_selectively_vert_row2(PLANE_TYPE plane_type, + uint8_t *s, int pitch, + unsigned int mask_16x16_l, + unsigned int mask_8x8_l, + unsigned int mask_4x4_l, + unsigned int mask_4x4_int_l, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + const int mask_shift = plane_type ? 4 : 8; + const int mask_cutoff = plane_type ? 0xf : 0xff; + const int lfl_forward = plane_type ? 4 : 8; + + unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff; + unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff; + unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff; + unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff; + unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff; + unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff; + unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff; + unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff; unsigned int mask; - for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 | + mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1; mask; mask >>= 1) { - const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward); + // TODO(yunqingwang): count in loopfilter functions should be removed. 
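// A minimal sketch of the two-row mask packing this function relies on,
// inferred from mask_shift/mask_cutoff above: for the y plane each packed
// mask word holds two 8-bit rows, for uv two 4-bit rows. Splitting one
// word back into its per-row masks looks like:
//
//   unsigned int row0 = mask_l & mask_cutoff;                  /* row r   */
//   unsigned int row1 = (mask_l >> mask_shift) & mask_cutoff;  /* row r+1 */
//
// Each loop iteration then consumes one column bit from both rows at once,
// which is what enables the *_dual calls that filter two 8-pixel edges
// (at s and at s + 8 * pitch) in a single pass.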
if (mask & 1) { - if (mask_16x16 & 1) { - vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr); - } else if (mask_8x8 & 1) { - vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - } else if (mask_4x4 & 1) { - vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + if ((mask_16x16_0 | mask_16x16_1) & 1) { + if ((mask_16x16_0 & mask_16x16_1) & 1) { + vp9_lpf_vertical_16_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); + } else if (mask_16x16_0 & 1) { + vp9_lpf_vertical_16(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr); + } else { + vp9_lpf_vertical_16(s + 8 *pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + if ((mask_8x8_0 & mask_8x8_1) & 1) { + vp9_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_8x8_0 & 1) { + vp9_lpf_vertical_8(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + 1); + } else { + vp9_lpf_vertical_8(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + vp9_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + vp9_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + 1); + } else { + vp9_lpf_vertical_4(s + 8 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } + } + + if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) { + if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) { + vp9_lpf_vertical_4_dual(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_int_0 & 1) { + vp9_lpf_vertical_4(s + 4, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, 1); + } else { + vp9_lpf_vertical_4(s + 8 * pitch + 4, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, 1); + } } } - if (mask_4x4_int & 1) - vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + s += 8; lfl += 1; - mask_16x16 >>= 1; - mask_8x8 >>= 1; - mask_4x4 >>= 1; - mask_4x4_int >>= 1; + mask_16x16_0 >>= 1; + mask_8x8_0 >>= 1; + mask_4x4_0 >>= 1; + mask_4x4_int_0 >>= 1; + mask_16x16_1 >>= 1; + mask_8x8_1 >>= 1; + mask_4x4_1 >>= 1; + mask_4x4_int_1 >>= 1; } } @@ -378,88 +410,73 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, if (mask & 1) { if (mask_16x16 & 1) { if ((mask_16x16 & 3) == 3) { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); count = 2; } else { - vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vp9_lpf_horizontal_16(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } else if (mask_8x8 & 1) { if ((mask_8x8 & 3) == 3) { // Next block's thresholds const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, - lfin->lim, lfin->hev_thr, 1); + vp9_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); if ((mask_4x4_int & 3) == 3) { - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. 
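// The deletions here resolve the TODO above: each pair of adjacent
// 8-pixel filter calls collapses into a single dual call. A sketch of the
// replacement pattern used throughout this hunk:
//
//   vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, lfi->lim,
//                             lfi->hev_thr, lfin->mblim, lfin->lim,
//                             lfin->hev_thr);
//   /* replaces two vp9_lpf_horizontal_4(..., 1) calls, one at
//      s + 4 * pitch and one at s + 8 + 4 * pitch */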
- vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); } else { if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); else if (mask_4x4_int & 2) - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); } count = 2; } else { - vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vp9_lpf_horizontal_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } else if (mask_4x4 & 1) { if ((mask_4x4 & 3) == 3) { // Next block's thresholds const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, 1); - + vp9_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); if ((mask_4x4_int & 3) == 3) { - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4_dual(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr); } else { if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); else if (mask_4x4_int & 2) - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_lpf_horizontal_4(s + 8 + 4 * pitch, pitch, lfin->mblim, + lfin->lim, lfin->hev_thr, 1); } count = 2; } else { - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - if (mask_4x4_int & 1) - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + if (mask_4x4_int & 1) + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } else if (mask_4x4_int & 1) { - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); + vp9_lpf_horizontal_4(s + 4 * pitch, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } s += 8 * count; @@ -472,8 +489,8 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, } // This function ors into the current lfm structure, where to do loop -// filters for the specific mi we are looking at. It uses information -// including the block_size_type (32x16, 32x32, etc), the transform size, +// filters for the specific mi we are looking at. 
It uses information +// including the block_size_type (32x16, 32x32, etc.), the transform size, // whether there were any coefficients encoded, and the loop filter strength // block we are currently looking at. Shift is used to position the // 1's we produce. @@ -482,28 +499,25 @@ static void build_masks(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, const int shift_uv, LOOP_FILTER_MASK *lfm) { - const BLOCK_SIZE block_size = mi->mbmi.sb_type; - const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const TX_SIZE tx_size_uv = get_uv_tx_size(&mi->mbmi); - const int skip = mi->mbmi.skip_coeff; - const int seg = mi->mbmi.segment_id; - const int ref = mi->mbmi.ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; - uint64_t *left_y = &lfm->left_y[tx_size_y]; - uint64_t *above_y = &lfm->above_y[tx_size_y]; - uint64_t *int_4x4_y = &lfm->int_4x4_y; - uint16_t *left_uv = &lfm->left_uv[tx_size_uv]; - uint16_t *above_uv = &lfm->above_uv[tx_size_uv]; - uint16_t *int_4x4_uv = &lfm->int_4x4_uv; + const MB_MODE_INFO *mbmi = &mi->mbmi; + const BLOCK_SIZE block_size = mbmi->sb_type; + const TX_SIZE tx_size_y = mbmi->tx_size; + const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1); + const int filter_level = get_filter_level(lfi_n, mbmi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; + uint16_t *const left_uv = &lfm->left_uv[tx_size_uv]; + uint16_t *const above_uv = &lfm->above_uv[tx_size_uv]; + uint16_t *const int_4x4_uv = &lfm->int_4x4_uv; int i; - int w = num_8x8_blocks_wide_lookup[block_size]; - int h = num_8x8_blocks_high_lookup[block_size]; // If filter level is 0 we don't loop filter. if (!filter_level) { return; } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; int index = shift_y; for (i = 0; i < h; i++) { vpx_memset(&lfm->lfl_y[index], filter_level, w); @@ -512,7 +526,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, } // These set 1 in the current block size for the block size edges. - // For instance if the block size is 32x16, we'll set : + // For instance if the block size is 32x16, we'll set: // above = 1111 // 0000 // and @@ -521,7 +535,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // NOTE : In this example the low bit is left most ( 1000 ) is stored as // 1, not 8... // - // U and v set things on a 16 bit scale. + // U and V set things on a 16 bit scale. // *above_y |= above_prediction_mask[block_size] << shift_y; *above_uv |= above_prediction_mask_uv[block_size] << shift_uv; @@ -530,10 +544,10 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // If the block has no coefficients and is not intra we skip applying // the loop filter on block edges. - if (skip && ref > INTRA_FRAME) + if (mbmi->skip && is_inter_block(mbmi)) return; - // Here we are adding a mask for the transform size. The transform + // Here we are adding a mask for the transform size. The transform // size mask is set to be correct for a 64x64 prediction block size. We // mask to match the size of the block we are working on and then shift it // into place.. @@ -551,37 +565,33 @@ static void build_masks(const loop_filter_info_n *const lfi_n, // boundaries. 
These differ from the 4x4 boundaries on the outside edge of // an 8x8 in that the internal ones can be skipped and don't depend on // the prediction block size. - if (tx_size_y == TX_4X4) { + if (tx_size_y == TX_4X4) *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; - } - if (tx_size_uv == TX_4X4) { + + if (tx_size_uv == TX_4X4) *int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv; - } } // This function does the same thing as the one above with the exception that -// it only affects the y masks. It exists because for blocks < 16x16 in size, +// it only affects the y masks. It exists because for blocks < 16x16 in size, // we only update u and v masks on the first block. static void build_y_mask(const loop_filter_info_n *const lfi_n, const MODE_INFO *mi, const int shift_y, LOOP_FILTER_MASK *lfm) { - const BLOCK_SIZE block_size = mi->mbmi.sb_type; - const TX_SIZE tx_size_y = mi->mbmi.tx_size; - const int skip = mi->mbmi.skip_coeff; - const int seg = mi->mbmi.segment_id; - const int ref = mi->mbmi.ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mi->mbmi.mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; - uint64_t *left_y = &lfm->left_y[tx_size_y]; - uint64_t *above_y = &lfm->above_y[tx_size_y]; - uint64_t *int_4x4_y = &lfm->int_4x4_y; + const MB_MODE_INFO *mbmi = &mi->mbmi; + const BLOCK_SIZE block_size = mbmi->sb_type; + const TX_SIZE tx_size_y = mbmi->tx_size; + const int filter_level = get_filter_level(lfi_n, mbmi); + uint64_t *const left_y = &lfm->left_y[tx_size_y]; + uint64_t *const above_y = &lfm->above_y[tx_size_y]; + uint64_t *const int_4x4_y = &lfm->int_4x4_y; int i; - int w = num_8x8_blocks_wide_lookup[block_size]; - int h = num_8x8_blocks_high_lookup[block_size]; if (!filter_level) { return; } else { + const int w = num_8x8_blocks_wide_lookup[block_size]; + const int h = num_8x8_blocks_high_lookup[block_size]; int index = shift_y; for (i = 0; i < h; i++) { vpx_memset(&lfm->lfl_y[index], filter_level, w); @@ -592,7 +602,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, *above_y |= above_prediction_mask[block_size] << shift_y; *left_y |= left_prediction_mask[block_size] << shift_y; - if (skip && ref > INTRA_FRAME) + if (mbmi->skip && is_inter_block(mbmi)) return; *above_y |= (size_mask[block_size] & @@ -601,25 +611,24 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n, *left_y |= (size_mask[block_size] & left_64x64_txform_mask[tx_size_y]) << shift_y; - if (tx_size_y == TX_4X4) { + if (tx_size_y == TX_4X4) *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y; - } } // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. // TODO(JBB): This function only works for yv12. -static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi_8x8, const int mode_info_stride, - LOOP_FILTER_MASK *lfm) { +void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, + MODE_INFO **mi, const int mode_info_stride, + LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi_8x8; - MODE_INFO **mip2 = mi_8x8; + MODE_INFO **mip = mi; + MODE_INFO **mip2 = mi; // These are offsets to the next mi in the 64x64 block. It is what gets - // added to the mi ptr as we go through each loop. It helps us to avoids - // setting up special row and column counters for each index. The last step + // added to the mi ptr as we go through each loop. 
It helps us to avoid + // setting up special row and column counters for each index. The last step // brings us out back to the starting position. const int offset_32[] = {4, (mode_info_stride << 2) - 4, 4, -(mode_info_stride << 2) - 4}; @@ -628,7 +637,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, const int offset[] = {1, mode_info_stride - 1, 1, -mode_info_stride - 1}; // Following variables represent shifts to position the current block - // mask over the appropriate block. A shift of 36 to the left will move + // mask over the appropriate block. A shift of 36 to the left will move // the bits for the final 32 by 32 block in the 64x64 up 4 rows and left // 4 rows to the appropriate spot. const int shift_32_y[] = {0, 4, 32, 36}; @@ -643,6 +652,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, cm->mi_cols - mi_col : MI_BLOCK_SIZE); vp9_zero(*lfm); + assert(mip[0] != NULL); // TODO(jimbankoski): Try moving most of the following code into decode // loop and storing lfm in the mbmi structure so that we don't have to go @@ -758,7 +768,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->above_uv[TX_16X16] |= lfm->above_uv[TX_32X32]; // We do at least 8 tap filter on every 32x32 even if the transform size - // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and // remove it from the 4x4. lfm->left_y[TX_8X8] |= lfm->left_y[TX_4X4] & left_border; lfm->left_y[TX_4X4] &= ~left_border; @@ -787,7 +797,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->int_4x4_y &= mask_y; lfm->int_4x4_uv &= mask_uv; - // We don't apply a wide loop filter on the last uv block row. If set + // We don't apply a wide loop filter on the last uv block row. If set // apply the shorter one instead. if (rows == 1) { lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16]; @@ -821,7 +831,7 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->int_4x4_y &= mask_y; lfm->int_4x4_uv &= mask_uv_int; - // We don't apply a wide loop filter on the last uv column. If set + // We don't apply a wide loop filter on the last uv column. If set // apply the shorter one instead. if (columns == 1) { lfm->left_uv[TX_8X8] |= lfm->left_uv[TX_16X16]; @@ -832,7 +842,8 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, lfm->left_uv[TX_16X16] &= ~(lfm->left_uv[TX_16X16] & 0xcccc); } } - // We don't a loop filter on the first column in the image. Mask that out. + // We don't apply a loop filter on the first column in the image, mask that + // out. 
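// A short illustration of the bit layout these masks assume, matching the
// tables at the top of this file: each 64-bit y mask is an 8x8 grid of
// 8x8 blocks, one byte per row, bit index row * 8 + col, with the low bit
// of each byte being the leftmost column. That is why clearing the
// frame's left edge only has to drop bit 0 of every byte:
//
//   0xfe == 11111110b    /* keep columns 1..7, clear column 0 */
//   lfm->left_y[i] &= 0xfefefefefefefefe;
//
// The 16-bit uv masks follow the same scheme on a 4x4 grid (4 bits per
// row).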
if (mi_col == 0) { for (i = 0; i < TX_32X32; i++) { lfm->left_y[i] &= 0xfefefefefefefefe; @@ -859,15 +870,37 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16])); } -#if CONFIG_NON420 -static uint8_t build_lfi(const loop_filter_info_n *lfi_n, - const MB_MODE_INFO *mbmi) { - const int seg = mbmi->segment_id; - const int ref = mbmi->ref_frame[0]; - const int mode = lfi_n->mode_lf_lut[mbmi->mode]; - const int filter_level = lfi_n->lvl[seg][ref][mode]; +static void filter_selectively_vert(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + unsigned int mask; - return filter_level; + for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int; + mask; mask >>= 1) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + + if (mask & 1) { + if (mask_16x16 & 1) { + vp9_lpf_vertical_16(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } else if (mask_8x8 & 1) { + vp9_lpf_vertical_8(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } else if (mask_4x4 & 1) { + vp9_lpf_vertical_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + } + } + if (mask_4x4_int & 1) + vp9_lpf_vertical_4(s + 4, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + s += 8; + lfl += 1; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } } static void filter_block_plane_non420(VP9_COMMON *cm, @@ -878,7 +911,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm, const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_x; const int col_step = 1 << ss_y; - const int row_step_stride = cm->mode_info_stride * row_step; + const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; unsigned int mask_16x16[MI_BLOCK_SIZE] = {0}; @@ -897,25 +930,25 @@ static void filter_block_plane_non420(VP9_COMMON *cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const MODE_INFO *mi = mi_8x8[c]; - const int skip_this = mi[0].mbmi.skip_coeff - && is_inter_block(&mi[0].mbmi); + const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type; + const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi); // left edge of current unit is block/partition edge -> no skip - const int block_edge_left = b_width_log2(mi[0].mbmi.sb_type) ? - !(c & ((1 << (b_width_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ? + !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1; const int skip_this_c = skip_this && !block_edge_left; // top edge of current unit is block/partition edge -> no skip - const int block_edge_above = b_height_log2(mi[0].mbmi.sb_type) ? - !(r & ((1 << (b_height_log2(mi[0].mbmi.sb_type)-1)) - 1)) : 1; + const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ? + !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1; const int skip_this_r = skip_this && !block_edge_above; const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) - ? get_uv_tx_size(&mi[0].mbmi) + ? 
get_uv_tx_size(&mi[0].mbmi, plane) : mi[0].mbmi.tx_size; const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; // Filter level can vary per MI if (!(lfl[(r << 3) + (c >> ss_x)] = - build_lfi(&cm->lf_info, &mi[0].mbmi))) + get_filter_level(&cm->lf_info, &mi[0].mbmi))) continue; // Build masks based on the transform size of each block @@ -1007,15 +1040,13 @@ static void filter_block_plane_non420(VP9_COMMON *cm, dst->buf += 8 * dst->stride; } } -#endif -static void filter_block_plane(VP9_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, - LOOP_FILTER_MASK *lfm) { +void vp9_filter_block_plane(VP9_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm) { struct buf_2d *const dst = &plane->dst; uint8_t* const dst0 = dst->buf; - unsigned int mask_4x4_int_row[MI_BLOCK_SIZE] = {0}; int r, c; if (!plane->plane_type) { @@ -1024,23 +1055,27 @@ static void filter_block_plane(VP9_COMMON *const cm, uint64_t mask_4x4 = lfm->left_y[TX_4X4]; uint64_t mask_4x4_int = lfm->int_4x4_y; - // Vertical pass - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { - mask_4x4_int_row[r] = mask_4x4_int & 0xff; + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + unsigned int mask_16x16_l = mask_16x16 & 0xffff; + unsigned int mask_8x8_l = mask_8x8 & 0xffff; + unsigned int mask_4x4_l = mask_4x4 & 0xffff; + unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff; // Disable filtering on the leftmost column - filter_selectively_vert(dst->buf, dst->stride, - mask_16x16 & 0xff, - mask_8x8 & 0xff, - mask_4x4 & 0xff, - mask_4x4_int_row[r], - &cm->lf_info, &lfm->lfl_y[r << 3]); + filter_selectively_vert_row2(plane->plane_type, + dst->buf, dst->stride, + mask_16x16_l, + mask_8x8_l, + mask_4x4_l, + mask_4x4_int_l, + &cm->lf_info, &lfm->lfl_y[r << 3]); - dst->buf += 8 * dst->stride; - mask_16x16 >>= 8; - mask_8x8 >>= 8; - mask_4x4 >>= 8; - mask_4x4_int >>= 8; + dst->buf += 16 * dst->stride; + mask_16x16 >>= 16; + mask_8x8 >>= 16; + mask_4x4 >>= 16; + mask_4x4_int >>= 16; } // Horizontal pass @@ -1048,6 +1083,7 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16 = lfm->above_y[TX_16X16]; mask_8x8 = lfm->above_y[TX_8X8]; mask_4x4 = lfm->above_y[TX_4X4]; + mask_4x4_int = lfm->int_4x4_y; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) { unsigned int mask_16x16_r; @@ -1068,13 +1104,14 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16_r, mask_8x8_r, mask_4x4_r, - mask_4x4_int_row[r], + mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3]); dst->buf += 8 * dst->stride; mask_16x16 >>= 8; mask_8x8 >>= 8; mask_4x4 >>= 8; + mask_4x4_int >>= 8; } } else { uint16_t mask_16x16 = lfm->left_uv[TX_16X16]; @@ -1082,27 +1119,37 @@ static void filter_block_plane(VP9_COMMON *const cm, uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; uint16_t mask_4x4_int = lfm->int_4x4_uv; - // Vertical pass - for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { + // Vertical pass: do 2 rows at one time + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) { if (plane->plane_type == 1) { - for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) + for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) { lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)]; + lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + + (c << 1)]; + } } - mask_4x4_int_row[r] = mask_4x4_int & 0xf; - // Disable 
filtering on the leftmost column - filter_selectively_vert(dst->buf, dst->stride, - mask_16x16 & 0xf, - mask_8x8 & 0xf, - mask_4x4 & 0xf, - mask_4x4_int_row[r], - &cm->lf_info, &lfm->lfl_uv[r << 1]); + { + unsigned int mask_16x16_l = mask_16x16 & 0xff; + unsigned int mask_8x8_l = mask_8x8 & 0xff; + unsigned int mask_4x4_l = mask_4x4 & 0xff; + unsigned int mask_4x4_int_l = mask_4x4_int & 0xff; - dst->buf += 8 * dst->stride; - mask_16x16 >>= 4; - mask_8x8 >>= 4; - mask_4x4 >>= 4; - mask_4x4_int >>= 4; + // Disable filtering on the leftmost column + filter_selectively_vert_row2(plane->plane_type, + dst->buf, dst->stride, + mask_16x16_l, + mask_8x8_l, + mask_4x4_l, + mask_4x4_int_l, + &cm->lf_info, &lfm->lfl_uv[r << 1]); + + dst->buf += 16 * dst->stride; + mask_16x16 >>= 8; + mask_8x8 >>= 8; + mask_4x4 >>= 8; + mask_4x4_int >>= 8; + } } // Horizontal pass @@ -1110,11 +1157,12 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16 = lfm->above_uv[TX_16X16]; mask_8x8 = lfm->above_uv[TX_8X8]; mask_4x4 = lfm->above_uv[TX_4X4]; + mask_4x4_int = lfm->int_4x4_uv; for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) { const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? - 0 : (mask_4x4_int_row[r]); + 0 : (mask_4x4_int & 0xf); unsigned int mask_16x16_r; unsigned int mask_8x8_r; unsigned int mask_4x4_r; @@ -1140,66 +1188,61 @@ static void filter_block_plane(VP9_COMMON *const cm, mask_16x16 >>= 4; mask_8x8 >>= 4; mask_4x4 >>= 4; + mask_4x4_int >>= 4; } } } void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, - VP9_COMMON *cm, MACROBLOCKD *xd, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; - int mi_row, mi_col; + const int use_420 = y_only || (planes[1].subsampling_y == 1 && + planes[1].subsampling_x == 1); LOOP_FILTER_MASK lfm; -#if CONFIG_NON420 - int use_420 = y_only || (xd->plane[1].subsampling_y == 1 && - xd->plane[1].subsampling_x == 1); -#endif + int mi_row, mi_col; for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) { - MODE_INFO **mi_8x8 = cm->mi_grid_visible + mi_row * cm->mode_info_stride; + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { int plane; - setup_dst_planes(xd, frame_buffer, mi_row, mi_col); + vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); // TODO(JBB): Make setup_mask work for non 420. 
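// A note on the use_420 gate below, inferred from the subsampling check
// above: the LOOP_FILTER_MASK fast path packs uv positions on a
// 4x4-per-64x64 grid, which only matches chroma geometry when both uv
// planes are subsampled by two in each direction (4:2:0); any other
// sampling keeps the per-block filter_block_plane_non420() fallback.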
-#if CONFIG_NON420 if (use_420) -#endif - setup_mask(cm, mi_row, mi_col, mi_8x8 + mi_col, cm->mode_info_stride, - &lfm); + vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, + &lfm); for (plane = 0; plane < num_planes; ++plane) { -#if CONFIG_NON420 if (use_420) -#endif - filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm); -#if CONFIG_NON420 + vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); else - filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col, + filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row, mi_col); -#endif } } } } -void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, MACROBLOCKD *xd, int frame_filter_level, - int y_only, int partial) { + int y_only, int partial_frame) { int start_mi_row, end_mi_row, mi_rows_to_filter; if (!frame_filter_level) return; start_mi_row = 0; mi_rows_to_filter = cm->mi_rows; - if (partial && cm->mi_rows > 8) { + if (partial_frame && cm->mi_rows > 8) { start_mi_row = cm->mi_rows >> 1; start_mi_row &= 0xfffffff8; mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); } end_mi_row = start_mi_row + mi_rows_to_filter; vp9_loop_filter_frame_init(cm, frame_filter_level); - vp9_loop_filter_rows(cm->frame_to_show, cm, xd, + vp9_loop_filter_rows(frame, cm, xd->plane, start_mi_row, end_mi_row, y_only); } @@ -1207,7 +1250,7 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, int vp9_loop_filter_worker(void *arg1, void *arg2) { LFWorkerData *const lf_data = (LFWorkerData*)arg1; (void)arg2; - vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, + vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); return 1; } diff --git a/media/libvpx/vp9/common/vp9_loopfilter.h b/media/libvpx/vp9/common/vp9_loopfilter.h index 62389ea5e339..6fa2773e594c 100644 --- a/media/libvpx/vp9/common/vp9_loopfilter.h +++ b/media/libvpx/vp9/common/vp9_loopfilter.h @@ -17,6 +17,10 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_seg_common.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_LOOP_FILTER 63 #define MAX_SHARPNESS 7 @@ -54,12 +58,44 @@ typedef struct { typedef struct { loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; - uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; +// This structure holds bit masks for all 8x8 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. Above_ entries refer to whether or not to apply a +// filter on the above border. Int_ entries refer to whether or not to +// apply borders on the 4x4 edges within the 8x8 block that each bit +// represents. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + uint64_t left_y[TX_SIZES]; + uint64_t above_y[TX_SIZES]; + uint64_t int_4x4_y; + uint16_t left_uv[TX_SIZES]; + uint16_t above_uv[TX_SIZES]; + uint16_t int_4x4_uv; + uint8_t lfl_y[64]; + uint8_t lfl_uv[16]; +} LOOP_FILTER_MASK; + /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; +struct VP9LfSyncData; + +// This function sets up the bit masks for the entire 64x64 region represented +// by mi_row, mi_col. 
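// A minimal usage sketch for the two functions exported here, mirroring
// the loop in vp9_loop_filter_rows() above: build the masks for one 64x64
// superblock, then filter each plane with them.
//
//   LOOP_FILTER_MASK lfm;
//   vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
//   for (plane = 0; plane < num_planes; ++plane)
//     vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);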
+void vp9_setup_mask(struct VP9Common *const cm, + const int mi_row, const int mi_col, + MODE_INFO **mi_8x8, const int mode_info_stride, + LOOP_FILTER_MASK *lfm); + +void vp9_filter_block_plane(struct VP9Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, + LOOP_FILTER_MASK *lfm); void vp9_loop_filter_init(struct VP9Common *cm); @@ -68,27 +104,35 @@ void vp9_loop_filter_init(struct VP9Common *cm); // calls this function directly. void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); -void vp9_loop_filter_frame(struct VP9Common *cm, +void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, + struct VP9Common *cm, struct macroblockd *mbd, int filter_level, - int y_only, int partial); + int y_only, int partial_frame); // Apply the loop filter to [start, stop) macro block rows in frame_buffer. void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, - struct VP9Common *cm, struct macroblockd *xd, + struct VP9Common *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only); typedef struct LoopFilterWorkerData { const YV12_BUFFER_CONFIG *frame_buffer; struct VP9Common *cm; - struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the - // loopfilter. the planes are necessary as their state - // is changed during decode. + struct macroblockd_plane planes[MAX_MB_PLANE]; + int start; int stop; int y_only; + + struct VP9LfSyncData *lf_sync; + int num_lf_workers; } LFWorkerData; // Operates on the rows described by LFWorkerData passed as 'arg1'. int vp9_loop_filter_worker(void *arg1, void *arg2); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/media/libvpx/vp9/common/vp9_loopfilter_filters.c b/media/libvpx/vp9/common/vp9_loopfilter_filters.c index 2c4bf6cb2378..25d3311b6f6b 100644 --- a/media/libvpx/vp9/common/vp9_loopfilter_filters.c +++ b/media/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -70,7 +70,7 @@ static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, return hev; } -static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { int8_t filter1, filter2; @@ -78,6 +78,7 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, const int8_t ps0 = (int8_t) *op0 ^ 0x80; const int8_t qs0 = (int8_t) *oq0 ^ 0x80; const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); // add outer taps if we have high edge variance int8_t filter = signed_char_clamp(ps1 - qs1) & hev; @@ -101,11 +102,9 @@ static INLINE void filter4(int8_t mask, uint8_t hev, uint8_t *op1, *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } -void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -115,17 +114,22 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); - filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * 
p, s - 1 * p, s, s + 1 * p); ++s; } } -void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -135,13 +139,21 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); - filter4(mask, hev, s - 2, s - 1, s, s + 1); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } } -static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, 1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, @@ -158,15 +170,13 @@ static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat, *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); } else { - filter4(mask, hev, op1, op0, oq0, oq1); + filter4(mask, thresh, op1, op0, oq0, oq1); } } -void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -177,19 +187,24 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); ++s; } } -void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, 1); +} + +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; for (i = 0; 
i < 8 * count; ++i) { @@ -197,15 +212,23 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(thresh[0], p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); s += pitch; } } -static INLINE void filter16(int8_t mask, uint8_t hev, +void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, + thresh1, 1); +} + +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, @@ -252,15 +275,13 @@ static INLINE void filter16(int8_t mask, uint8_t hev, *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); } else { - filter8(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, - int count) { +void vp9_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { int i; // loop filter designed to work using chars so that we can make maximum use @@ -270,13 +291,12 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - filter16(mask, hev, flat, flat2, + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, @@ -285,25 +305,35 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, } } -void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { int i; - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7]); - filter16(mask, hev, flat, flat2, + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); s += p; } } + +void vp9_lpf_vertical_16_c(uint8_t *s, int p, const 
uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +} + +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +} diff --git a/media/libvpx/vp9/common/vp9_mv.h b/media/libvpx/vp9/common/vp9_mv.h index 31a79b984023..5d89da8c25a4 100644 --- a/media/libvpx/vp9/common/vp9_mv.h +++ b/media/libvpx/vp9/common/vp9_mv.h @@ -15,7 +15,11 @@ #include "vp9/common/vp9_common.h" -typedef struct { +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct mv { int16_t row; int16_t col; } MV; @@ -25,15 +29,27 @@ typedef union int_mv { MV as_mv; } int_mv; /* facilitates faster equality tests and copies */ -typedef struct { +typedef struct mv32 { int32_t row; int32_t col; } MV32; -static void clamp_mv(MV *mv, int min_col, int max_col, - int min_row, int max_row) { +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { mv->col = clamp(mv->col, min_col, max_col); mv->row = clamp(mv->row, min_row, max_row); } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_MV_H_ diff --git a/media/libvpx/vp9/common/vp9_mvref_common.c b/media/libvpx/vp9/common/vp9_mvref_common.c index 8df8aec84840..ab64d30360e8 100644 --- a/media/libvpx/vp9/common/vp9_mvref_common.c +++ b/media/libvpx/vp9/common/vp9_mvref_common.c @@ -11,187 +11,23 @@ #include "vp9/common/vp9_mvref_common.h" -#define MVREF_NEIGHBOURS 8 - -typedef enum { - BOTH_ZERO = 0, - ZERO_PLUS_PREDICTED = 1, - BOTH_PREDICTED = 2, - NEW_PLUS_NON_INTRA = 3, - BOTH_NEW = 4, - INTRA_PLUS_NON_INTRA = 5, - BOTH_INTRA = 6, - INVALID_CASE = 9 -} motion_vector_context; - -// This is used to figure out a context for the ref blocks. The code flattens -// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by -// adding 9 for each intra block, 3 for each zero mv and 1 for each new -// motion vector. This single number is then converted into a context -// with a single lookup ( counter_to_context ). -static const int mode_2_counter[MB_MODE_COUNT] = { - 9, // DC_PRED - 9, // V_PRED - 9, // H_PRED - 9, // D45_PRED - 9, // D135_PRED - 9, // D117_PRED - 9, // D153_PRED - 9, // D207_PRED - 9, // D63_PRED - 9, // TM_PRED - 0, // NEARESTMV - 0, // NEARMV - 3, // ZEROMV - 1, // NEWMV -}; - -// There are 3^3 different combinations of 3 counts that can be either 0,1 or -// 2. However the actual count can never be greater than 2 so the highest -// counter we need is 18. 9 is an invalid counter that's never used. 
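// A worked example of the flattening described above, using values from
// mode_2_counter (this block moves verbatim to vp9_mvref_common.h later
// in the patch): one ZEROMV neighbour (3) plus one NEWMV neighbour (1)
// gives context_counter == 4, and counter_to_context[4] is
// NEW_PLUS_NON_INTRA; two intra neighbours give 9 + 9 == 18, BOTH_INTRA.
//
//   assert(counter_to_context[mode_2_counter[ZEROMV] +
//                             mode_2_counter[NEWMV]] == NEW_PLUS_NON_INTRA);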
-static const int counter_to_context[19] = { - BOTH_PREDICTED, // 0 - NEW_PLUS_NON_INTRA, // 1 - BOTH_NEW, // 2 - ZERO_PLUS_PREDICTED, // 3 - NEW_PLUS_NON_INTRA, // 4 - INVALID_CASE, // 5 - BOTH_ZERO, // 6 - INVALID_CASE, // 7 - INVALID_CASE, // 8 - INTRA_PLUS_NON_INTRA, // 9 - INTRA_PLUS_NON_INTRA, // 10 - INVALID_CASE, // 11 - INTRA_PLUS_NON_INTRA, // 12 - INVALID_CASE, // 13 - INVALID_CASE, // 14 - INVALID_CASE, // 15 - INVALID_CASE, // 16 - INVALID_CASE, // 17 - BOTH_INTRA // 18 -}; - -static const MV mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { - // 4X4 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 4X8 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 8X4 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 8X8 - {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, - // 8X16 - {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, - // 16X8 - {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, - // 16X16 - {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // 16X32 - {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, - // 32X16 - {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // 32X32 - {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, - // 32X64 - {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, - // 64X32 - {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, - // 64X64 - {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} -}; - -static const int idx_n_column_to_subblock[4][2] = { - {1, 2}, - {1, 3}, - {3, 2}, - {3, 3} -}; - -// clamp_mv_ref -#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units - -static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, - xd->mb_to_right_edge + MV_BORDER, - xd->mb_to_top_edge - MV_BORDER, - xd->mb_to_bottom_edge + MV_BORDER); -} - -// This function returns either the appropriate sub block or block's mv -// on whether the block_size < 8x8 and we have check_sub_blocks set. -static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, - int search_col, int block_idx) { - return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8 - ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] - .as_mv[which_mv] - : candidate->mbmi.mv[which_mv]; -} - - -// Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, - const MV_REFERENCE_FRAME this_ref_frame, - const int *ref_sign_bias) { - int_mv mv = mbmi->mv[ref]; - if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; - } - return mv; -} - -// This macro is used to add a motion vector mv_ref list if it isn't -// already in the list. If it's the second motion vector it will also -// skip all additional processing and jump to done! -#define ADD_MV_REF_LIST(MV) \ - do { \ - if (refmv_count) { \ - if ((MV).as_int != mv_ref_list[0].as_int) { \ - mv_ref_list[refmv_count] = (MV); \ - goto Done; \ - } \ - } else { \ - mv_ref_list[refmv_count++] = (MV); \ - } \ - } while (0) - -// If either reference frame is different, not INTRA, and they -// are different from each other scale and add the mv to our list. 
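// A sketch of the list semantics encoded by ADD_MV_REF_LIST above (the
// macro's behaviour is unchanged by this patch; see the assert on
// MAX_MV_REF_CANDIDATES further down): slot 0 always takes the first
// candidate, and the first candidate that differs from slot 0 both fills
// slot 1 and ends the whole search by jumping to Done — the list never
// holds more than two motion vectors.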
-#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \ - do { \ - if ((CANDIDATE)->ref_frame[0] != ref_frame) \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ - if ((CANDIDATE)->ref_frame[1] != ref_frame && \ - has_second_ref(CANDIDATE) && \ - (CANDIDATE)->mv[1].as_int != (CANDIDATE)->mv[0].as_int) \ - ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ - } while (0) - - -// Checks that the given mi_row, mi_col and search point -// are inside the borders of the tile. -static INLINE int is_inside(const TileInfo *const tile, - int mi_col, int mi_row, int mi_rows, - const MV *mv) { - return !(mi_row + mv->row < 0 || - mi_col + mv->col < tile->mi_col_start || - mi_row + mv->row >= mi_rows || - mi_col + mv->col >= tile->mi_col_end); -} - // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col) { +static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int block, int mi_row, int mi_col) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; - const MV *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi + ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col] + : NULL; const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->mbmi : NULL; + + + const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + int different_ref_found = 0; int context_counter = 0; @@ -202,26 +38,19 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // if the size < 8x8 we get the mv from the bmi substructure, // and we also need to keep a mode count. for (i = 0; i < 2; ++i) { - const MV *const mv_ref = &mv_ref_search[i]; + const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row - * xd->mode_info_stride]; + const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]; const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; // Keep counts for entropy encoding. context_counter += mode_2_counter[candidate->mode]; + different_ref_found = 1; - // Check if the candidate comes from the same reference frame. - if (candidate->ref_frame[0] == ref_frame) { - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, - mv_ref->col, block_idx)); - different_ref_found = candidate->ref_frame[1] != ref_frame; - } else { - if (candidate->ref_frame[1] == ref_frame) - // Add second motion vector if it has the same ref_frame. - ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, - mv_ref->col, block_idx)); - different_ref_found = 1; - } + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block)); + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block)); } } @@ -229,20 +58,16 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // as before except we don't need to keep track of sub blocks or // mode counts. 
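// A sketch of the neighbour addressing used in these loops, assuming the
// row-major mode-info grid implied by xd->mi_stride: a POSITION offset
// {row, col} relative to the current block resolves to
//
//   const MODE_INFO *cand = xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
//
// so {-1, 0} is the block directly above and {0, -1} the block to the
// left, matching the mv_ref_blocks tables.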
for (; i < MVREF_NEIGHBOURS; ++i) { - const MV *const mv_ref = &mv_ref_search[i]; + const POSITION *const mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + - mv_ref->row - * xd->mode_info_stride]->mbmi; + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]->mbmi; + different_ref_found = 1; - if (candidate->ref_frame[0] == ref_frame) { + if (candidate->ref_frame[0] == ref_frame) ADD_MV_REF_LIST(candidate->mv[0]); - different_ref_found = candidate->ref_frame[1] != ref_frame; - } else { - if (candidate->ref_frame[1] == ref_frame) - ADD_MV_REF_LIST(candidate->mv[1]); - different_ref_found = 1; - } + else if (candidate->ref_frame[1] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[1]); } } @@ -259,21 +84,19 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, // different reference frames. if (different_ref_found) { for (i = 0; i < MVREF_NEIGHBOURS; ++i) { - const MV *mv_ref = &mv_ref_search[i]; + const POSITION *mv_ref = &mv_ref_search[i]; if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { - const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col + - mv_ref->row - * xd->mode_info_stride]->mbmi; + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row + * xd->mi_stride]->mbmi; // If the candidate is INTRA we don't want to consider its mv. - if (is_inter_block(candidate)) - IF_DIFF_REF_FRAME_ADD_MV(candidate); + IF_DIFF_REF_FRAME_ADD_MV(candidate); } } } // Since we still don't have a candidate we'll try the last frame. - if (prev_mbmi && is_inter_block(prev_mbmi)) + if (prev_mbmi) IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi); Done: @@ -284,3 +107,84 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) clamp_mv_ref(&mv_ref_list[i].as_mv, xd); } + +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1, + mi_row, mi_col); +} + +static void lower_mv_precision(MV *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(mv); + if (!use_hp) { + if (mv->row & 1) + mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) + mv->col += (mv->col > 0 ? 
-1 : 1); + } +} + + +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp); + clamp_mv2(&mvlist[i].as_mv, xd); + } + *nearest = mvlist[0]; + *near = mvlist[1]; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest, int_mv *near) { + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *const mi = xd->mi[0]; + b_mode_info *bmi = mi->bmi; + int n; + + assert(MAX_MV_REF_CANDIDATES == 2); + + find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block, + mi_row, mi_col); + + near->as_int = 0; + switch (block) { + case 0: + nearest->as_int = mv_list[0].as_int; + near->as_int = mv_list[1].as_int; + break; + case 1: + case 2: + nearest->as_int = bmi[0].as_mv[ref].as_int; + for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n) + if (nearest->as_int != mv_list[n].as_int) { + near->as_int = mv_list[n].as_int; + break; + } + break; + case 3: { + int_mv candidates[2 + MAX_MV_REF_CANDIDATES]; + candidates[0] = bmi[1].as_mv[ref]; + candidates[1] = bmi[0].as_mv[ref]; + candidates[2] = mv_list[0]; + candidates[3] = mv_list[1]; + + nearest->as_int = bmi[2].as_mv[ref].as_int; + for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n) + if (nearest->as_int != candidates[n].as_int) { + near->as_int = candidates[n].as_int; + break; + } + break; + } + default: + assert("Invalid block index."); + } +} diff --git a/media/libvpx/vp9/common/vp9_mvref_common.h b/media/libvpx/vp9/common/vp9_mvref_common.h index ce4c55983be3..a937b7823a51 100644 --- a/media/libvpx/vp9/common/vp9_mvref_common.h +++ b/media/libvpx/vp9/common/vp9_mvref_common.h @@ -7,29 +7,221 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifdef __cplusplus +extern "C" { +#endif -void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int block_idx, - int mi_row, int mi_col); +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS -\ + VP9_INTERP_EXTEND) << 3) -static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, - MODE_INFO *mi, const MODE_INFO *prev_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, - int mi_row, int mi_col) { - vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame, - mv_ref_list, -1, mi_row, mi_col); +#define MVREF_NEIGHBOURS 8 + +typedef struct position { + int row; + int col; +} POSITION; + +typedef enum { + BOTH_ZERO = 0, + ZERO_PLUS_PREDICTED = 1, + BOTH_PREDICTED = 2, + NEW_PLUS_NON_INTRA = 3, + BOTH_NEW = 4, + INTRA_PLUS_NON_INTRA = 5, + BOTH_INTRA = 6, + INVALID_CASE = 9 +} motion_vector_context; + +// This is used to figure out a context for the ref blocks. 
The code flattens +// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by +// adding 9 for each intra block, 3 for each zero mv and 1 for each new +// motion vector. This single number is then converted into a context +// with a single lookup ( counter_to_context ). +static const int mode_2_counter[MB_MODE_COUNT] = { + 9, // DC_PRED + 9, // V_PRED + 9, // H_PRED + 9, // D45_PRED + 9, // D135_PRED + 9, // D117_PRED + 9, // D153_PRED + 9, // D207_PRED + 9, // D63_PRED + 9, // TM_PRED + 0, // NEARESTMV + 0, // NEARMV + 3, // ZEROMV + 1, // NEWMV +}; + +// There are 3^3 different combinations of 3 counts that can be either 0,1 or +// 2. However the actual count can never be greater than 2 so the highest +// counter we need is 18. 9 is an invalid counter that's never used. +static const int counter_to_context[19] = { + BOTH_PREDICTED, // 0 + NEW_PLUS_NON_INTRA, // 1 + BOTH_NEW, // 2 + ZERO_PLUS_PREDICTED, // 3 + NEW_PLUS_NON_INTRA, // 4 + INVALID_CASE, // 5 + BOTH_ZERO, // 6 + INVALID_CASE, // 7 + INVALID_CASE, // 8 + INTRA_PLUS_NON_INTRA, // 9 + INTRA_PLUS_NON_INTRA, // 10 + INVALID_CASE, // 11 + INTRA_PLUS_NON_INTRA, // 12 + INVALID_CASE, // 13 + INVALID_CASE, // 14 + INVALID_CASE, // 15 + INVALID_CASE, // 16 + INVALID_CASE, // 17 + BOTH_INTRA // 18 +}; + +static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = { + // 4X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 4X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X4 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X8 + {{-1, 0}, {0, -1}, {-1, -1}, {-2, 0}, {0, -2}, {-2, -1}, {-1, -2}, {-2, -2}}, + // 8X16 + {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, + // 16X8 + {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, + // 16X16 + {{-1, 0}, {0, -1}, {-1, 1}, {1, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 16X32 + {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, + // 32X16 + {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X32 + {{-1, 1}, {1, -1}, {-1, 2}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // 32X64 + {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, + // 64X32 + {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, + // 64X64 + {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}} +}; + +static const int idx_n_column_to_subblock[4][2] = { + {1, 2}, + {1, 3}, + {3, 2}, + {3, 3} +}; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); } +// This function returns either the appropriate sub block or block's mv +// on whether the block_size < 8x8 and we have check_sub_blocks set. +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv, + int search_col, int block_idx) { + return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8 + ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] + .as_mv[which_mv] + : candidate->mbmi.mv[which_mv]; +} + + +// Performs mv sign inversion if indicated by the reference frame combination. 
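+/* A worked example of the flattening described above (illustrative only,
+ * not part of the patch; assumes <assert.h> and the two tables are in
+ * scope).  Two candidate neighbours, one NEWMV and one ZEROMV, score
+ * 1 + 3 = 4, and counter_to_context[4] is NEW_PLUS_NON_INTRA: */
+static INLINE int example_mode_context(void) {
+  int context_counter = 0;
+  context_counter += mode_2_counter[NEWMV];   /* +1 */
+  context_counter += mode_2_counter[ZEROMV];  /* +3 */
+  assert(counter_to_context[context_counter] == NEW_PLUS_NON_INTRA);
+  return counter_to_context[context_counter];
+}
+/* scale_mv() below applies the sign inversion that the preceding comment
+ * describes. */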
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, + const MV_REFERENCE_FRAME this_ref_frame, + const int *ref_sign_bias) { + int_mv mv = mbmi->mv[ref]; + if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { + mv.as_mv.row *= -1; + mv.as_mv.col *= -1; + } + return mv; +} + +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector it will also +// skip all additional processing and jump to done! +#define ADD_MV_REF_LIST(mv) \ + do { \ + if (refmv_count) { \ + if ((mv).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (mv); \ + goto Done; \ + } \ + } else { \ + mv_ref_list[refmv_count++] = (mv); \ + } \ + } while (0) + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \ + do { \ + if (is_inter_block(mbmi)) { \ + if ((mbmi)->ref_frame[0] != ref_frame) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \ + if (has_second_ref(mbmi) && \ + (mbmi)->ref_frame[1] != ref_frame && \ + (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ + ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \ + } \ + } while (0) + + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, + int mi_col, int mi_row, int mi_rows, + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < 0 || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= mi_rows || + mi_col + mi_pos->col >= tile->mi_col_end); +} + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); +} + +void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, int mi_row, int mi_col); + +// check a list of motion vectors by sad score using a number rows of pixels +// above and a number cols of pixels in the left to select the one with best +// score to use as ref motion vector +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, + int_mv *mvlist, int_mv *nearest, int_mv *near); + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *const tile, + int block, int ref, int mi_row, int mi_col, + int_mv *nearest, int_mv *near); + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/media/libvpx/vp9/common/vp9_onyx.h b/media/libvpx/vp9/common/vp9_onyx.h deleted file mode 100644 index cda68a28519a..000000000000 --- a/media/libvpx/vp9/common/vp9_onyx.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_ONYX_H_ -#define VP9_COMMON_VP9_ONYX_H_ - -#ifdef __cplusplus -extern "C" -{ // NOLINT -#endif - -#include "./vpx_config.h" -#include "vpx/internal/vpx_codec_internal.h" -#include "vpx/vp8cx.h" -#include "vpx_scale/yv12config.h" -#include "vp9/common/vp9_ppflags.h" - -#define MAX_SEGMENTS 8 - - typedef int *VP9_PTR; - - /* Create/destroy static data structures. */ - - typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 - } VPX_SCALING; - - typedef enum { - VP9_LAST_FLAG = 1, - VP9_GOLD_FLAG = 2, - VP9_ALT_FLAG = 4 - } VP9_REFFRAME; - - - typedef enum { - USAGE_STREAM_FROM_SERVER = 0x0, - USAGE_LOCAL_FILE_PLAYBACK = 0x1, - USAGE_CONSTRAINED_QUALITY = 0x2, - USAGE_CONSTANT_QUALITY = 0x3, - } END_USAGE; - - - typedef enum { - MODE_GOODQUALITY = 0x1, - MODE_BESTQUALITY = 0x2, - MODE_FIRSTPASS = 0x3, - MODE_SECONDPASS = 0x4, - MODE_SECONDPASS_BEST = 0x5, - } MODE; - - typedef enum { - FRAMEFLAGS_KEY = 1, - FRAMEFLAGS_GOLDEN = 2, - FRAMEFLAGS_ALTREF = 4, - } FRAMETYPE_FLAGS; - - typedef enum { - NO_AQ = 0, - VARIANCE_AQ = 1, - AQ_MODES_COUNT // This should always be the last member of the enum - } AQ_MODES; - - typedef struct { - int version; // 4 versions of bitstream defined: - // 0 - best quality/slowest decode, - // 3 - lowest quality/fastest decode - int width; // width of data passed to the compressor - int height; // height of data passed to the compressor - double framerate; // set to passed in framerate - int64_t target_bandwidth; // bandwidth to be used in kilobits per second - - int noise_sensitivity; // pre processing blur: recommendation 0 - int Sharpness; // sharpening output: recommendation 0: - int cpu_used; - unsigned int rc_max_intra_bitrate_pct; - - // mode -> - // (0)=Realtime/Live Encoding. This mode is optimized for realtime - // encoding (for example, capturing a television signal or feed from - // a live camera). ( speed setting controls how fast ) - // (1)=Good Quality Fast Encoding. The encoder balances quality with the - // amount of time it takes to encode the output. ( speed setting - // controls how fast ) - // (2)=One Pass - Best Quality. The encoder places priority on the - // quality of the output over encoding speed. The output is compressed - // at the highest possible quality. This option takes the longest - // amount of time to encode. ( speed setting ignored ) - // (3)=Two Pass - First Pass. The encoder generates a file of statistics - // for use in the second encoding pass. ( speed setting controls how - // fast ) - // (4)=Two Pass - Second Pass. The encoder uses the statistics that were - // generated in the first encoding pass to create the compressed - // output. ( speed setting controls how fast ) - // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that - // were generated in the first encoding pass to create the compressed - // output using the highest possible quality, and taking a - // longer amount of time to encode.. ( speed setting ignored ) - int Mode; - - // Key Framing Operations - int auto_key; // autodetect cut scenes and set the keyframes - int key_freq; // maximum distance to key frame. 
- - int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) - int lag_in_frames; // how many frames lag before we start encoding - - // ---------------------------------------------------------------- - // DATARATE CONTROL OPTIONS - - int end_usage; // vbr or cbr - - // buffer targeting aggressiveness - int under_shoot_pct; - int over_shoot_pct; - - // buffering parameters - int64_t starting_buffer_level; // in seconds - int64_t optimal_buffer_level; - int64_t maximum_buffer_size; - - // controlling quality - int fixed_q; - int worst_allowed_q; - int best_allowed_q; - int cq_level; - int lossless; - int aq_mode; // Adaptive Quantization mode - - // two pass datarate control - int two_pass_vbrbias; // two pass datarate control tweaks - int two_pass_vbrmin_section; - int two_pass_vbrmax_section; - // END DATARATE CONTROL OPTIONS - // ---------------------------------------------------------------- - - // Spatial scalability - int ss_number_layers; - - // these parameters aren't to be used in final build don't use!!! - int play_alternate; - int alt_freq; - - int encode_breakout; // early breakout : for video conf recommend 800 - - /* Bitfield defining the error resiliency features to enable. - * Can provide decodable frames after losses in previous - * frames and decodable partitions after losses in the same frame. - */ - unsigned int error_resilient_mode; - - /* Bitfield defining the parallel decoding mode where the - * decoding in successive frames may be conducted in parallel - * just by decoding the frame headers. - */ - unsigned int frame_parallel_decoding_mode; - - int arnr_max_frames; - int arnr_strength; - int arnr_type; - - int tile_columns; - int tile_rows; - - struct vpx_fixed_buf two_pass_stats_in; - struct vpx_codec_pkt_list *output_pkt_list; - - vp8e_tuning tuning; - } VP9_CONFIG; - - - void vp9_initialize_enc(); - - VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf); - void vp9_remove_compressor(VP9_PTR *comp); - - void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf); - - // receive a frames worth of data. caller can assume that a copy of this - // frame is made and not just a copy of the pointer.. 
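-/* A hypothetical driver for the legacy interface being removed here (a
- * sketch only: the helper name, the zero-on-success convention assumed for
- * vp9_get_compressed_data(), and the buffer size are inventions, and
- * VP9_CONFIG setup plus error handling are elided).  It exercises only
- * functions declared in this file, starting with the vp9_receive_raw_frame()
- * call whose copy semantics the comment above documents: */
-static void example_encode_one_frame(VP9_CONFIG *oxcf, YV12_BUFFER_CONFIG *sd,
-                                     int64_t time_stamp, int64_t end_time) {
-  VP9_PTR enc = vp9_create_compressor(oxcf);
-  unsigned int flags = 0;
-  unsigned long size = 0;
-  int64_t pts = 0, pts_end = 0;
-  static unsigned char buf[256 * 1024];  /* arbitrary output buffer */
-
-  /* Per the comment above, 'sd' is copied, so the caller may reuse it. */
-  vp9_receive_raw_frame(enc, flags, sd, time_stamp, end_time);
-  if (vp9_get_compressed_data(enc, &flags, &size, buf,
-                              &pts, &pts_end, 0) == 0) {
-    /* 'size' bytes of compressed bitstream are now in buf. */
-  }
-  vp9_remove_compressor(&enc);
-}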
- int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); - - int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, - int64_t *time_stamp, int64_t *time_end, - int flush); - - int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); - - int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); - - int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_update_entropy(VP9_PTR comp, int update); - - int vp9_set_roimap(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols, - int delta_q[MAX_SEGMENTS], - int delta_lf[MAX_SEGMENTS], - unsigned int threshold[MAX_SEGMENTS]); - - int vp9_set_active_map(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols); - - int vp9_set_internal_size(VP9_PTR comp, - VPX_SCALING horiz_mode, VPX_SCALING vert_mode); - - int vp9_set_size_literal(VP9_PTR comp, unsigned int width, - unsigned int height); - - void vp9_set_svc(VP9_PTR comp, int use_svc); - - int vp9_get_quantizer(VP9_PTR c); - -#ifdef __cplusplus -} -#endif - -#endif // VP9_COMMON_VP9_ONYX_H_ diff --git a/media/libvpx/vp9/common/vp9_onyxc_int.h b/media/libvpx/vp9/common/vp9_onyxc_int.h index 2333a61db4ac..637867aa1ab4 100644 --- a/media/libvpx/vp9/common/vp9_onyxc_int.h +++ b/media/libvpx/vp9/common/vp9_onyxc_int.h @@ -18,6 +18,7 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_frame_buffers.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_tile_common.h" @@ -25,71 +26,48 @@ #include "vp9/common/vp9_postproc.h" #endif -#define ALLOWED_REFS_PER_FRAME 3 +#ifdef __cplusplus +extern "C" { +#endif -#define NUM_REF_FRAMES_LOG2 3 -#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2) +#define REFS_PER_FRAME 3 + +#define REF_FRAMES_LOG2 3 +#define REF_FRAMES (1 << REF_FRAMES_LOG2) // 1 scratch frame for the new frame, 3 for scaled references on the encoder // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. 
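+/* Sizing check (illustrative note, not part of the patch): with
+ * REF_FRAMES == 1 << 3 == 8, the pool defined below is
+ * FRAME_BUFFERS == 8 + 4 == 12 entries -- the 8 reference slots, plus the
+ * 1 scratch frame and 3 scaled-reference buffers itemised in the comment
+ * above. */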
-#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) +#define FRAME_BUFFERS (REF_FRAMES + 4) -#define NUM_FRAME_CONTEXTS_LOG2 2 -#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2) +#define FRAME_CONTEXTS_LOG2 2 +#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) -typedef struct frame_contexts { - vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1]; - vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1]; - vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1]; - vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS - 1]; - vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1]; - vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; - vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; - vp9_prob single_ref_prob[REF_CONTEXTS][2]; - vp9_prob comp_ref_prob[REF_CONTEXTS]; - struct tx_probs tx_probs; - vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; - nmv_context nmvc; -} FRAME_CONTEXT; - -typedef struct { - unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; - unsigned int uv_mode[INTRA_MODES][INTRA_MODES]; - unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; - unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] - [COEF_BANDS][PREV_COEF_CONTEXTS]; - unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES]; - unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; - unsigned int single_ref[REF_CONTEXTS][2][2]; - unsigned int comp_ref[REF_CONTEXTS][2]; - struct tx_counts tx; - unsigned int mbskip[MBSKIP_CONTEXTS][2]; - nmv_context_counts mv; -} FRAME_COUNTS; +extern const struct { + PARTITION_CONTEXT above; + PARTITION_CONTEXT left; +} partition_context_lookup[BLOCK_SIZES]; typedef enum { - SINGLE_PREDICTION_ONLY = 0, - COMP_PREDICTION_ONLY = 1, - HYBRID_PREDICTION = 2, - NB_PREDICTION_TYPES = 3, -} COMPPREDMODE_TYPE; + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} REFERENCE_MODE; + + +typedef struct { + int ref_count; + vpx_codec_frame_buffer_t raw_frame_buffer; + YV12_BUFFER_CONFIG buf; +} RefCntBuffer; typedef struct VP9Common { struct vpx_internal_error_info error; DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); -#endif COLOR_SPACE color_space; @@ -106,19 +84,22 @@ typedef struct VP9Common { int subsampling_x; int subsampling_y; +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth; // Marks if we need to use 16bit frame buffers. +#endif + YV12_BUFFER_CONFIG *frame_to_show; - YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; - int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */ - int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */ + RefCntBuffer frame_bufs[FRAME_BUFFERS]; + + int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */ // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and // roll new_fb_idx into it. 
- // Each frame can reference ALLOWED_REFS_PER_FRAME buffers - int active_ref_idx[ALLOWED_REFS_PER_FRAME]; - struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; - struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME]; + // Each frame can reference REFS_PER_FRAME buffers + RefBuffer frame_refs[REFS_PER_FRAME]; + int new_fb_idx; YV12_BUFFER_CONFIG post_proc_buffer; @@ -128,6 +109,7 @@ typedef struct VP9Common { int show_frame; int last_show_frame; + int show_existing_frame; // Flag signaling that the frame is encoded using only INTRA modes. int intra_only; @@ -139,13 +121,12 @@ typedef struct VP9Common { // frame header, 3 reset all contexts. int reset_frame_context; - int frame_flags; // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in // MODE_INFO (8-pixel) units. int MBs; int mb_rows, mi_rows; int mb_cols, mi_cols; - int mode_info_stride; + int mi_stride; /* profile settings */ TX_MODE tx_mode; @@ -154,14 +135,16 @@ typedef struct VP9Common { int y_dc_delta_q; int uv_dc_delta_q; int uv_ac_delta_q; -#if CONFIG_ALPHA - int a_dc_delta_q; - int a_ac_delta_q; -#endif /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. */ + int mi_idx; + int prev_mi_idx; + int mi_alloc_size; + MODE_INFO *mip_array[2]; + MODE_INFO **mi_grid_base_array[2]; + MODE_INFO *mip; /* Base of allocated array */ MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ @@ -175,7 +158,7 @@ typedef struct VP9Common { // Persistent mb segment id map used in prediction. unsigned char *last_frame_seg_map; - INTERPOLATION_TYPE mcomp_filter_type; + INTERP_FILTER interp_filter; loop_filter_info_n lf_info; @@ -190,15 +173,18 @@ typedef struct VP9Common { int allow_comp_inter_inter; MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; - COMPPREDMODE_TYPE comp_pred_mode; + REFERENCE_MODE reference_mode; FRAME_CONTEXT fc; /* this frame entropy */ - FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; + FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS]; unsigned int frame_context_idx; /* Context to use/update */ FRAME_COUNTS counts; unsigned int current_video_frame; - int version; + BITSTREAM_PROFILE profile; + + // VPX_BITS_8 in profile 0 or 1, VPX_BITS_10 or VPX_BITS_12 in profile 2 or 3. + vpx_bit_depth_t bit_depth; #if CONFIG_VP9_POSTPROC struct postproc_state postproc_state; @@ -208,67 +194,101 @@ typedef struct VP9Common { int frame_parallel_decoding_mode; int log2_tile_cols, log2_tile_rows; + + // Private data associated with the frame buffer callbacks. + void *cb_priv; + vpx_get_frame_buffer_cb_fn_t get_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + // Handles memory for the codec. 
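+/* Worked illustration of the RefCntBuffer accounting used here (counts
+ * invented): if bufs[0].ref_count == 1, bufs[3].ref_count == 1 and
+ * slot == 0, then ref_cnt_fb(bufs, &slot, 3) -- defined later in this
+ * header -- leaves bufs[0].ref_count == 0 (free for get_free_fb()),
+ * bufs[3].ref_count == 2 and slot == 3.  This replaces the old parallel
+ * fb_idx_ref_cnt[] array with a count kept beside each buffer. */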
+ InternalFrameBufferList int_frame_buffers; + + PARTITION_CONTEXT *above_seg_context; + ENTROPY_CONTEXT *above_context; } VP9_COMMON; -// ref == 0 => LAST_FRAME -// ref == 1 => GOLDEN_FRAME -// ref == 2 => ALTREF_FRAME -static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) { - return &cm->yv12_fb[cm->active_ref_idx[ref]]; +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) + return NULL; + if (cm->ref_frame_map[index] < 0) + return NULL; + assert(cm->ref_frame_map[index] < FRAME_BUFFERS); + return &cm->frame_bufs[cm->ref_frame_map[index]].buf; } -static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { - return &cm->yv12_fb[cm->new_fb_idx]; +static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { + return &cm->frame_bufs[cm->new_fb_idx].buf; } -static int get_free_fb(VP9_COMMON *cm) { +static INLINE int get_free_fb(VP9_COMMON *cm) { int i; - for (i = 0; i < NUM_YV12_BUFFERS; i++) - if (cm->fb_idx_ref_cnt[i] == 0) + for (i = 0; i < FRAME_BUFFERS; i++) + if (cm->frame_bufs[i].ref_count == 0) break; - assert(i < NUM_YV12_BUFFERS); - cm->fb_idx_ref_cnt[i] = 1; + assert(i < FRAME_BUFFERS); + cm->frame_bufs[i].ref_count = 1; return i; } -static void ref_cnt_fb(int *buf, int *idx, int new_idx) { - if (buf[*idx] > 0) - buf[*idx]--; +static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) { + const int ref_index = *idx; + + if (ref_index >= 0 && bufs[ref_index].ref_count > 0) + bufs[ref_index].ref_count--; *idx = new_idx; - buf[new_idx]++; + bufs[new_idx].ref_count++; } -static int mi_cols_aligned_to_sb(int n_mis) { +static INLINE int mi_cols_aligned_to_sb(int n_mis) { return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2); } -static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) { - return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx] - : cm->fc.partition_prob[ctx]; +static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) { + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + xd->plane[i].dqcoeff = xd->dqcoeff[i]; + xd->above_context[i] = cm->above_context + + i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols); + } + + xd->above_seg_context = cm->above_seg_context; + xd->mi_stride = cm->mi_stride; } -static INLINE void set_skip_context( - MACROBLOCKD *xd, - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE], - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16], - int mi_row, int mi_col) { +static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { + return cm->frame_type == KEY_FRAME || cm->intra_only; +} + +static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm, + int ctx) { + return frame_is_intra_only(cm) ? 
vp9_kf_partition_probs[ctx] + : cm->fc.partition_prob[ctx]; +} + +static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { const int above_idx = mi_col * 2; const int left_idx = (mi_row * 2) & 15; int i; - for (i = 0; i < MAX_MB_PLANE; i++) { + for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; - pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x); - pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y); + pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x]; + pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y]; } } -static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, - int mi_row, int bh, - int mi_col, int bw, - int mi_rows, int mi_cols) { +static INLINE int calc_mi_size(int len) { + // len is in mi units. + return len + MI_BLOCK_SIZE; +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, + int mi_col, int bw, + int mi_rows, int mi_cols) { xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8; xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); @@ -279,72 +299,59 @@ static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, xd->left_available = (mi_col > tile->mi_col_start); } -static void set_prev_mi(VP9_COMMON *cm) { +static INLINE void set_prev_mi(VP9_COMMON *cm) { const int use_prev_in_find_mv_refs = cm->width == cm->last_width && cm->height == cm->last_height && - !cm->error_resilient_mode && !cm->intra_only && cm->last_show_frame; // Special case: set prev_mi to NULL when the previous mode info // context cannot be used. cm->prev_mi = use_prev_in_find_mv_refs ? - cm->prev_mip + cm->mode_info_stride + 1 : NULL; + cm->prev_mip + cm->mi_stride + 1 : NULL; } -static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) { - return cm->frame_type == KEY_FRAME || cm->intra_only; -} +static INLINE void update_partition_context(MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE subsize, + BLOCK_SIZE bsize) { + PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col; + PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK); -static INLINE void update_partition_context( - PARTITION_CONTEXT *above_seg_context, - PARTITION_CONTEXT left_seg_context[8], - int mi_row, int mi_col, - BLOCK_SIZE sb_type, - BLOCK_SIZE sb_size) { - PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; - PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); - - const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; - const int bwl = b_width_log2(sb_type); - const int bhl = b_height_log2(sb_type); - const int boffset = b_width_log2(BLOCK_64X64) - bsl; - const char pcval0 = ~(0xe << boffset); - const char pcval1 = ~(0xf << boffset); - const char pcvalue[2] = {pcval0, pcval1}; - - assert(MAX(bwl, bhl) <= bsl); + // num_4x4_blocks_wide_lookup[bsize] / 2 + const int bs = num_8x8_blocks_wide_lookup[bsize]; // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition // bits of smaller block sizes to be zero. 
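+  /* Worked case (assumption: bsize == BLOCK_8X8, for which
+   * num_8x8_blocks_wide_lookup[] yields 1): bs == 1, so each vpx_memset()
+   * below stamps a single context byte taken from
+   * partition_context_lookup[subsize], replacing the bwl/bhl shift
+   * arithmetic that the removed lines computed by hand. */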
- vpx_memset(above_ctx, pcvalue[bwl == bsl], bs); - vpx_memset(left_ctx, pcvalue[bhl == bsl], bs); + vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs); + vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs); } -static INLINE int partition_plane_context( - const PARTITION_CONTEXT *above_seg_context, - const PARTITION_CONTEXT left_seg_context[8], - int mi_row, int mi_col, - BLOCK_SIZE sb_type) { - const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col; - const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK); +static INLINE int partition_plane_context(const MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col; + const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK); - int bsl = mi_width_log2(sb_type), bs = 1 << bsl; + const int bsl = mi_width_log2(bsize); + const int bs = 1 << bsl; int above = 0, left = 0, i; - int boffset = mi_width_log2(BLOCK_64X64) - bsl; - assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); + assert(b_width_log2(bsize) == b_height_log2(bsize)); assert(bsl >= 0); - assert(boffset >= 0); for (i = 0; i < bs; i++) { above |= above_ctx[i]; left |= left_ctx[i]; } - above = (above & (1 << boffset)) > 0; - left = (left & (1 << boffset)) > 0; + above = (above & bs) > 0; + left = (left & bs) > 0; return (left * 2 + above) + bsl * PARTITION_PLOFFSET; } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/media/libvpx/vp9/common/vp9_postproc.c b/media/libvpx/vp9/common/vp9_postproc.c index 212a28ab9764..e4e6ce7826ed 100644 --- a/media/libvpx/vp9/common/vp9_postproc.c +++ b/media/libvpx/vp9/common/vp9_postproc.c @@ -13,69 +13,18 @@ #include #include "./vpx_config.h" -#include "vpx_scale/yv12config.h" -#include "vp9/common/vp9_postproc.h" -#include "vp9/common/vp9_textblit.h" -#include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_systemdependent.h" -#include "./vp9_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "./vp9_rtcd.h" -#define RGB_TO_YUV(t) \ - ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ - (0.098*(float)(t & 0xff)) + 16), \ - (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \ - (0.439*(float)(t & 0xff)) + 128), \ - ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \ - (0.071*(float)(t & 0xff)) + 128) +#include "vpx_scale/vpx_scale.h" +#include "vpx_scale/yv12config.h" -/* global constants */ -#if 0 && CONFIG_POSTPROC_VISUALIZER -static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { - { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ - { RGB_TO_YUV(0x00FF00) }, /* Green */ - { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ - { RGB_TO_YUV(0x228B22) }, /* ForestGreen */ - { RGB_TO_YUV(0x006400) }, /* DarkGreen */ - { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */ - { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */ - { RGB_TO_YUV(0x00008B) }, /* Dark blue */ - { RGB_TO_YUV(0x551A8B) }, /* Purple */ - { RGB_TO_YUV(0xFF0000) } /* Red */ - { RGB_TO_YUV(0xCC33FF) }, /* Magenta */ -}; - -static const unsigned char B_PREDICTION_MODE_colors[INTRA_MODES][3] = { - { RGB_TO_YUV(0x6633ff) }, /* Purple */ - { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ - { RGB_TO_YUV(0xff33cc) }, /* Pink 
*/ - { RGB_TO_YUV(0xff3366) }, /* Coral */ - { RGB_TO_YUV(0x3366ff) }, /* Blue */ - { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */ - { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */ - { RGB_TO_YUV(0xff6633) }, /* Orange */ - { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ - { RGB_TO_YUV(0x8ab800) }, /* Green */ - { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ - { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ - { RGB_TO_YUV(0x66ff33) }, /* Light Green */ - { RGB_TO_YUV(0xccff33) }, /* Yellow */ -}; - -static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { - { RGB_TO_YUV(0x00ff00) }, /* Blue */ - { RGB_TO_YUV(0x0000ff) }, /* Green */ - { RGB_TO_YUV(0xffff00) }, /* Yellow */ - { RGB_TO_YUV(0xff0000) }, /* Red */ -}; -#endif +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_postproc.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/common/vp9_textblit.h" +#if CONFIG_VP9_POSTPROC static const short kernel5[] = { 1, 1, 4, 1, 1 }; @@ -127,9 +76,6 @@ const short vp9_rv[] = { 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, }; - -/**************************************************************************** - */ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, @@ -317,19 +263,13 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) vp9_post_proc_down_and_across(srcs[i], dsts[i], @@ -343,19 +283,13 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, 
dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { const int src_stride = src_strides[i]; @@ -371,7 +305,7 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, } } -double vp9_gaussian(double sigma, double mu, double x) { +static double gaussian(double sigma, double mu, double x) { return 1 / (sigma * sqrt(2.0 * 3.14159265)) * (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); } @@ -396,7 +330,7 @@ static void fillrd(struct postproc_state *state, int q, int a) { next = 0; for (i = -32; i < 32; i++) { - int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i)); + int a = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a) { for (j = 0; j < a; j++) { @@ -425,27 +359,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { state->last_noise = a; } -/**************************************************************************** - * - * ROUTINE : plane_add_noise_c - * - * INPUTS : unsigned char *Start starting address of buffer to - * add gaussian noise to - * unsigned int width width of plane - * unsigned int height height of plane - * int pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ void vp9_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], char whiteclamp[16], @@ -453,6 +366,9 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise, unsigned int width, unsigned int height, int pitch) { unsigned int i, j; + // TODO(jbb): why does simd code use both but c doesn't, normalize and + // fix.. + (void) bothclamp; for (i = 0; i < height; i++) { uint8_t *pos = start + i * pitch; char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT @@ -469,540 +385,53 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise, } } -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. - */ -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; i++) { - for (j = 0; j < 12; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; i++) { - for (j = 0; j < 6; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. 
- */ -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; i++) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; i++) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } -} - -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -static void constrain_line(int x0, int *x1, int y0, int *y1, - int width, int height) { - int dx; - int dy; - - if (*x1 > width) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = width; - if (dx) - *y1 = ((width - x0) * dy) / dx + y0; - } - if (*x1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = 0; - if (dx) - *y1 = ((0 - x0) * dy) / dx + y0; - } - if (*y1 > height) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = height; - if (dy) - *x1 = ((height - y0) * dx) / dy + x0; - } - if (*y1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = 0; - if (dy) - *x1 = ((0 - y0) * dx) / dy + x0; - } -} - int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) { - int q = cm->lf.filter_level * 10 / 6; - int flags = ppflags->post_proc_flag; - int deblock_level = ppflags->deblocking_level; - int noise_level = ppflags->noise_level; + const int q = MIN(63, cm->lf.filter_level * 10 / 6); + const int flags = ppflags->post_proc_flag; + YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; + struct postproc_state *const ppstate = &cm->postproc_state; if (!cm->frame_to_show) return -1; - if (q > 63) - q = 63; - if (!flags) { *dest = *cm->frame_to_show; return 0; } -#if ARCH_X86||ARCH_X86_64 - vpx_reset_mmx_state(); + vp9_clear_system_state(); + +#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS + if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate post-processing buffer"); #endif if (flags & VP9D_DEMACROBLOCK) { - deblock_and_de_macro_block(cm->frame_to_show, &cm->post_proc_buffer, - q + 
(deblock_level - 5) * 10, 1, 0); + deblock_and_de_macro_block(cm->frame_to_show, ppbuf, + q + (ppflags->deblocking_level - 5) * 10, 1, 0); } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, q); + vp9_deblock(cm->frame_to_show, ppbuf, q); } else { - vp8_yv12_copy_frame(cm->frame_to_show, &cm->post_proc_buffer); + vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); } if (flags & VP9D_ADDNOISE) { - if (cm->postproc_state.last_q != q - || cm->postproc_state.last_noise != noise_level) { - fillrd(&cm->postproc_state, 63 - q, noise_level); + const int noise_level = ppflags->noise_level; + if (ppstate->last_q != q || + ppstate->last_noise != noise_level) { + fillrd(ppstate, 63 - q, noise_level); } - vp9_plane_add_noise(cm->post_proc_buffer.y_buffer, - cm->postproc_state.noise, - cm->postproc_state.blackclamp, - cm->postproc_state.whiteclamp, - cm->postproc_state.bothclamp, - cm->post_proc_buffer.y_width, - cm->post_proc_buffer.y_height, - cm->post_proc_buffer.y_stride); + vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, + ppstate->whiteclamp, ppstate->bothclamp, + ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride); } -#if 0 && CONFIG_POSTPROC_VISUALIZER - if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { - char message[512]; - snprintf(message, sizeof(message) -1, - "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (cm->frame_type == KEY_FRAME), - cm->refresh_golden_frame, - cm->base_qindex, - cm->filter_level, - flags, - cm->mb_cols, cm->mb_rows); - vp9_blit_text(message, cm->post_proc_buffer.y_buffer, - cm->post_proc_buffer.y_stride); - } - - if (flags & VP9D_DEBUG_TXT_MBLK_MODES) { - int i, j; - uint8_t *y_ptr; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = cm->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - - snprintf(zz, sizeof(zz) - 1, "%c", mi[mb_index].mbmi.mode + 'a'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP9D_DEBUG_TXT_DC_DIFF) { - int i, j; - uint8_t *y_ptr; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = cm->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && - mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.skip_coeff); - - if (cm->frame_type == KEY_FRAME) - snprintf(zz, sizeof(zz) - 1, "a"); - else - snprintf(zz, sizeof(zz) - 1, "%c", dc_diff + '0'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - } - } - - if (flags & VP9D_DEBUG_TXT_RATE_INFO) { - char message[512]; - snprintf(message, sizeof(message), - "Bitrate: %10.2f framerate: %10.2f ", - cm->bitrate, cm->framerate); - vp9_blit_text(message, cm->post_proc_buffer.y_buffer, - cm->post_proc_buffer.y_stride); - } - - /* Draw motion vectors */ - if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = 
post->y_height; - uint8_t *y_buffer = cm->post_proc_buffer.y_buffer; - int y_stride = cm->post_proc_buffer.y_stride; - MODE_INFO *mi = cm->mi; - int x0, y0; - - for (y0 = 0; y0 < height; y0 += 16) { - for (x0 = 0; x0 < width; x0 += 16) { - int x1, y1; - - if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) { - mi++; - continue; - } - - if (mi->mbmi.mode == SPLITMV) { - switch (mi->mbmi.partitioning) { - case PARTITIONING_16X8 : { /* mv_top_bottom */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X16 : { /* mv_left_right */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X8 : { /* mv_quarters */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride); - - bmi = &mi->bmi[10]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); - break; - } - case PARTITIONING_4X4: - default : { - union b_mode_info *bmi = mi->bmi; - int bx0, by0; - - for (by0 = y0; by0 < (y0 + 16); by0 += 4) { - for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) { - MV *mv = &bmi->mv.as_mv; - - x1 = bx0 + 2 + (mv->col >> 3); - y1 = by0 + 2 + (mv->row >> 3); - - constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height); - vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride); - - bmi++; - } - } - } - } - } else if (is_inter_mode(mi->mbmi.mode)) { - MV *mv = &mi->mbmi.mv.as_mv; - const int lx0 = x0 + 8; - const int ly0 = y0 + 8; - - x1 = lx0 + (mv->col >> 3); - y1 = ly0 + (mv->row >> 3); - - if (x1 != lx0 && y1 != ly0) { - constrain_line(lx0, &x1, ly0 - 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride); - - constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else { - vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); - } - } - - mi++; - } - mi++; - } - } - - /* Color in block modes */ - if ((flags & VP9D_DEBUG_CLR_BLK_MODES) - && (ppflags->display_mb_modes_flag || 
ppflags->display_b_modes_flag)) { - int y, x; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_ptr = cm->post_proc_buffer.y_buffer; - uint8_t *u_ptr = cm->post_proc_buffer.u_buffer; - uint8_t *v_ptr = cm->post_proc_buffer.v_buffer; - int y_stride = cm->post_proc_buffer.y_stride; - MODE_INFO *mi = cm->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (mi->mbmi.mode == I4X4_PRED && - ((ppflags->display_mb_modes_flag & I4X4_PRED) || - ppflags->display_b_modes_flag)) { - int by, bx; - uint8_t *yl, *ul, *vl; - union b_mode_info *bmi = mi->bmi; - - yl = y_ptr + x; - ul = u_ptr + (x >> 1); - vl = v_ptr + (x >> 1); - - for (by = 0; by < 16; by += 4) { - for (bx = 0; bx < 16; bx += 4) { - if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) - || (ppflags->display_mb_modes_flag & I4X4_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode][2]; - - vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, - 0xc000, y_stride); - } - bmi++; - } - - yl += y_stride * 4; - ul += y_stride * 1; - vl += y_stride * 1; - } - } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { - Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; - U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; - V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; - - vp9_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), - Y, U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } - - /* Color in frame reference blocks */ - if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && - ppflags->display_ref_frame_flag) { - int y, x; - YV12_BUFFER_CONFIG *post = &cm->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - uint8_t *y_ptr = cm->post_proc_buffer.y_buffer; - uint8_t *u_ptr = cm->post_proc_buffer.u_buffer; - uint8_t *v_ptr = cm->post_proc_buffer.v_buffer; - int y_stride = cm->post_proc_buffer.y_stride; - MODE_INFO *mi = cm->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { - Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; - U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; - V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; - - vp9_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), - Y, U, V, 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } -#endif - - *dest = cm->post_proc_buffer; + *dest = *ppbuf; /* handle problem with extending borders */ dest->y_width = cm->width; @@ -1012,3 +441,4 @@ int vp9_post_proc_frame(struct VP9Common *cm, return 0; } +#endif diff --git a/media/libvpx/vp9/common/vp9_postproc.h b/media/libvpx/vp9/common/vp9_postproc.h index c63beae9dba6..ebebc1ae3467 100644 --- a/media/libvpx/vp9/common/vp9_postproc.h +++ b/media/libvpx/vp9/common/vp9_postproc.h @@ -13,6 +13,12 @@ #define VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" +#include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif struct postproc_state { int last_q; @@ -23,8 +29,7 @@ struct postproc_state { DECLARE_ALIGNED(16, char, bothclamp[16]); }; -#include "vp9/common/vp9_onyxc_int.h" -#include 
"vp9/common/vp9_ppflags.h" +struct VP9Common; int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags); @@ -33,4 +38,8 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_POSTPROC_H_ diff --git a/media/libvpx/vp9/common/vp9_ppflags.h b/media/libvpx/vp9/common/vp9_ppflags.h index 561c93028a4d..1644a1bbbe80 100644 --- a/media/libvpx/vp9/common/vp9_ppflags.h +++ b/media/libvpx/vp9/common/vp9_ppflags.h @@ -11,6 +11,10 @@ #ifndef VP9_COMMON_VP9_PPFLAGS_H_ #define VP9_COMMON_VP9_PPFLAGS_H_ +#ifdef __cplusplus +extern "C" { +#endif + enum { VP9D_NOFILTERING = 0, VP9D_DEBLOCK = 1 << 0, @@ -29,10 +33,10 @@ typedef struct { int post_proc_flag; int deblocking_level; int noise_level; - int display_ref_frame_flag; - int display_mb_modes_flag; - int display_b_modes_flag; - int display_mv_flag; } vp9_ppflags_t; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/media/libvpx/vp9/common/vp9_pragmas.h b/media/libvpx/vp9/common/vp9_pragmas.h deleted file mode 100644 index f079161d6b43..000000000000 --- a/media/libvpx/vp9/common/vp9_pragmas.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_PRAGMAS_H_ -#define VP9_COMMON_VP9_PRAGMAS_H_ - -#ifdef __INTEL_COMPILER -#pragma warning(disable:997 1011 170) -#endif - -#ifdef _MSC_VER -#pragma warning(disable:4799) -#endif - -#endif // VP9_COMMON_VP9_PRAGMAS_H_ diff --git a/media/libvpx/vp9/common/vp9_pred_common.c b/media/libvpx/vp9/common/vp9_pred_common.c index 6018e1775d42..014638466004 100644 --- a/media/libvpx/vp9/common/vp9_pred_common.c +++ b/media/libvpx/vp9/common/vp9_pred_common.c @@ -14,134 +14,110 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_treecoder.h" -static INLINE const MB_MODE_INFO *get_above_mbmi(const MODE_INFO *const above) { - return (above != NULL) ? &above->mbmi : NULL; -} - -static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) { - return (left != NULL) ? &left->mbmi : NULL; +static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) { + return (mi != NULL) ? &mi->mbmi : NULL; } // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - // left - const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi) - : 0; - const int left_interp = left_in_image && left_mv_pred - ? 
left_mi->mbmi.interp_filter - : SWITCHABLE_FILTERS; + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ? + left_mbmi->interp_filter : SWITCHABLE_FILTERS; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ? + above_mbmi->interp_filter : SWITCHABLE_FILTERS; - // above - const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi) - : 0; - const int above_interp = above_in_image && above_mv_pred - ? above_mi->mbmi.interp_filter - : SWITCHABLE_FILTERS; - - if (left_interp == above_interp) - return left_interp; - else if (left_interp == SWITCHABLE_FILTERS && - above_interp != SWITCHABLE_FILTERS) - return above_interp; - else if (left_interp != SWITCHABLE_FILTERS && - above_interp == SWITCHABLE_FILTERS) - return left_interp; + if (left_type == above_type) + return left_type; + else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS) + return above_type; + else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS) + return left_type; else return SWITCHABLE_FILTERS; } -// Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; - // The mode info data structure has a one element border above and to the - // left of the entries corresponding to real macroblocks. - // The prediction flags in these dummy entries are initialized to 0. - // 0 - inter/inter, inter/--, --/inter, --/-- - // 1 - intra/inter, inter/intra - // 2 - intra/--, --/intra - // 3 - intra/intra - if (above_in_image && left_in_image) // both edges available +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int vp9_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); return left_intra && above_intra ? 3 : left_intra || above_intra; - else if (above_in_image || left_in_image) // one edge available - return 2 * (above_in_image ? above_intra : left_intra); - else + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? 
above_mbmi : left_mbmi); + } else { return 0; + } } -// Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; + +int vp9_get_reference_mode_context(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) // neither edge uses comp pred (0/1) - pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ - (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); + ctx = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref); else if (!has_second_ref(above_mbmi)) // one of two edges uses comp pred (2/3) - pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || - !is_inter_block(above_mbmi)); + ctx = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(above_mbmi)); else if (!has_second_ref(left_mbmi)) // one of two edges uses comp pred (2/3) - pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || - !is_inter_block(left_mbmi)); + ctx = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref || + !is_inter_block(left_mbmi)); else // both edges use comp pred (4) - pred_context = 4; - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; if (!has_second_ref(edge_mbmi)) // edge does not use comp pred (0/1) - pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; + ctx = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref; else // edge uses comp pred (3) - pred_context = 3; + ctx = 3; } else { // no edges available (1) - pred_context = 1; + ctx = 1; } - assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); - return pred_context; + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; } // Returns a context number for the given MB prediction signal -unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? 
!is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int above_in_image = above_mbmi != NULL; + const int left_in_image = left_mbmi != NULL; + // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. @@ -150,6 +126,9 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, const int var_ref_idx = !fix_ref_idx; if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra (2) pred_context = 2; } else if (above_intra || left_intra) { // intra/inter @@ -163,10 +142,10 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, } else { // inter/inter const int l_sg = !has_second_ref(left_mbmi); const int a_sg = !has_second_ref(above_mbmi); - MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] - : above_mbmi->ref_frame[var_ref_idx]; - MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] - : left_mbmi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfa = a_sg ? above_mbmi->ref_frame[0] + : above_mbmi->ref_frame[var_ref_idx]; + const MV_REFERENCE_FRAME vrfl = l_sg ? left_mbmi->ref_frame[0] + : left_mbmi->ref_frame[var_ref_idx]; if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { pred_context = 0; @@ -179,8 +158,8 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, else pred_context = 1; } else if (l_sg || a_sg) { // single/comp - MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; - MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME vrfc = l_sg ? vrfa : vrfl; + const MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) pred_context = 1; else if (rfs == cm->comp_var_ref[1] && vrfc != cm->comp_var_ref[1]) @@ -212,21 +191,21 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, return pred_context; } -unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. 
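/* Editor's note, an illustration only (not part of the upstream patch):
 * every context helper in this file folds the above/left neighbors the
 * same way.  A NULL neighbor stands in for a dummy border entry; with the
 * intra/inter mapping quoted earlier in this file, the cases resolve as:
 *
 *   above / left        context
 *   inter / inter       0
 *   intra / inter       1   (either order)
 *   intra / missing     2   (one edge: 2 * is_intra(edge))
 *   inter / missing     0
 *   intra / intra       3
 *   missing / missing   0
 *
 * The single-reference helper below applies the same folding, but keyed
 * on which reference frame each neighbor used rather than on a binary
 * intra predicate, which widens the context range. */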
- if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter or inter/intra @@ -237,30 +216,31 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { pred_context = 1 + (edge_mbmi->ref_frame[0] == LAST_FRAME || edge_mbmi->ref_frame[1] == LAST_FRAME); } else { // inter/inter - if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) { - pred_context = 2 * (above_mbmi->ref_frame[0] == LAST_FRAME) + - 2 * (left_mbmi->ref_frame[0] == LAST_FRAME); - } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) { - pred_context = 1 + (above_mbmi->ref_frame[0] == LAST_FRAME || - above_mbmi->ref_frame[1] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[1] == LAST_FRAME); - } else { - const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + const int above_has_second = has_second_ref(above_mbmi); + const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; + + if (above_has_second && left_has_second) { + pred_context = 1 + (above0 == LAST_FRAME || above1 == LAST_FRAME || + left0 == LAST_FRAME || left1 == LAST_FRAME); + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1; if (rfs == LAST_FRAME) pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); else - pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; + pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + } else { + pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME); } } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi)) { // intra pred_context = 2; } else { // inter @@ -278,22 +258,21 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { return pred_context; } -unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { int pred_context; - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1; - const int left_intra = left_in_image ? 
!is_inter_block(left_mbmi) : 1; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; // Note: // The mode info data structure has a one element border above and to the // left of the entries correpsonding to real macroblocks. // The prediction flags in these dummy entries are initialised to 0. - if (above_in_image && left_in_image) { // both edges available + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + if (above_intra && left_intra) { // intra/intra pred_context = 2; } else if (above_intra || left_intra) { // intra/inter or inter/intra @@ -308,36 +287,25 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { edge_mbmi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter - if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) { - if (above_mbmi->ref_frame[0] == LAST_FRAME && - left_mbmi->ref_frame[0] == LAST_FRAME) { - pred_context = 3; - } else if (above_mbmi->ref_frame[0] == LAST_FRAME || - left_mbmi->ref_frame[0] == LAST_FRAME) { - const MB_MODE_INFO *edge_mbmi = - above_mbmi->ref_frame[0] == LAST_FRAME ? left_mbmi : above_mbmi; + const int above_has_second = has_second_ref(above_mbmi); + const int left_has_second = has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1]; + const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1]; - pred_context = 4 * (edge_mbmi->ref_frame[0] == GOLDEN_FRAME); - } else { - pred_context = 2 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME) + - 2 * (left_mbmi->ref_frame[0] == GOLDEN_FRAME); - } - } else if (has_second_ref(above_mbmi) && has_second_ref(left_mbmi)) { - if (above_mbmi->ref_frame[0] == left_mbmi->ref_frame[0] && - above_mbmi->ref_frame[1] == left_mbmi->ref_frame[1]) - pred_context = 3 * (above_mbmi->ref_frame[0] == GOLDEN_FRAME || - above_mbmi->ref_frame[1] == GOLDEN_FRAME || - left_mbmi->ref_frame[0] == GOLDEN_FRAME || - left_mbmi->ref_frame[1] == GOLDEN_FRAME); + if (above_has_second && left_has_second) { + if (above0 == left0 && above1 == left1) + pred_context = 3 * (above0 == GOLDEN_FRAME || + above1 == GOLDEN_FRAME || + left0 == GOLDEN_FRAME || + left1 == GOLDEN_FRAME); else pred_context = 2; - } else { - const MV_REFERENCE_FRAME rfs = !has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf1 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[0] : left_mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME crf2 = has_second_ref(above_mbmi) ? - above_mbmi->ref_frame[1] : left_mbmi->ref_frame[1]; + } else if (above_has_second || left_has_second) { + const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0; + const MV_REFERENCE_FRAME crf2 = above_has_second ? 
above1 : left1; if (rfs == GOLDEN_FRAME) pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); @@ -345,10 +313,21 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; else pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } else { + if (above0 == LAST_FRAME && left0 == LAST_FRAME) { + pred_context = 3; + } else if (above0 == LAST_FRAME || left0 == LAST_FRAME) { + const MV_REFERENCE_FRAME edge0 = (above0 == LAST_FRAME) ? left0 + : above0; + pred_context = 4 * (edge0 == GOLDEN_FRAME); + } else { + pred_context = 2 * (above0 == GOLDEN_FRAME) + + 2 * (left0 == GOLDEN_FRAME); + } } } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; if (!is_inter_block(edge_mbmi) || (edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi))) @@ -368,39 +347,26 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. // The prediction flags in these dummy entries are initialized to 0. -unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) { - const MODE_INFO *const above_mi = get_above_mi(xd); - const MODE_INFO *const left_mi = get_left_mi(xd); - const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi); - const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi); - const int above_in_image = above_mi != NULL; - const int left_in_image = left_mi != NULL; - const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type]; - int above_context = max_tx_size; - int left_context = max_tx_size; +int vp9_get_tx_size_context(const MACROBLOCKD *xd) { + const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type]; + const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd)); + const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); + const int has_above = above_mbmi != NULL; + const int has_left = left_mbmi != NULL; + int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size + : max_tx_size; + int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size + : max_tx_size; + if (!has_left) + left_ctx = above_ctx; - if (above_in_image) - above_context = above_mbmi->skip_coeff ? max_tx_size - : above_mbmi->tx_size; + if (!has_above) + above_ctx = left_ctx; - if (left_in_image) - left_context = left_mbmi->skip_coeff ? 
max_tx_size - : left_mbmi->tx_size; - - if (!left_in_image) - left_context = above_context; - - if (!above_in_image) - above_context = left_context; - - return above_context + left_context > max_tx_size; + return (above_ctx + left_ctx) > max_tx_size; } -void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) { - xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag; -} - -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[bsize]; diff --git a/media/libvpx/vp9/common/vp9_pred_common.h b/media/libvpx/vp9/common/vp9_pred_common.h index 919093080038..2c965068a0ff 100644 --- a/media/libvpx/vp9/common/vp9_pred_common.h +++ b/media/libvpx/vp9/common/vp9_pred_common.h @@ -14,15 +14,19 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) { - return xd->up_available ? xd->mi_8x8[-xd->mode_info_stride] : NULL; + return xd->up_available ? xd->mi[-xd->mi_stride] : NULL; } static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { - return xd->left_available ? xd->mi_8x8[-1] : NULL; + return xd->left_available ? xd->mi[-1] : NULL; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col); static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { @@ -35,55 +39,42 @@ static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { return above_sip + left_sip; } -static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg, +static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg, const MACROBLOCKD *xd) { return seg->pred_probs[vp9_get_pred_context_seg_id(xd)]; } -void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag); - -static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) { +static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) { const MODE_INFO *const above_mi = get_above_mi(xd); const MODE_INFO *const left_mi = get_left_mi(xd); - const int above_skip_coeff = (above_mi != NULL) ? - above_mi->mbmi.skip_coeff : 0; - const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0; - - return above_skip_coeff + left_skip_coeff; + const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0; + const int left_skip = (left_mi != NULL) ? 
left_mi->mbmi.skip : 0; + return above_skip + left_skip; } -static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm, +static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.skip_probs[vp9_get_skip_context(xd)]; +} + +int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); + +int vp9_get_intra_inter_context(const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)]; + return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)]; } -static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) { - return xd->mi_8x8[0]->mbmi.skip_coeff; +int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd); + +static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm, + const MACROBLOCKD *xd) { + return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)]; } -unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd); - -unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd); - -static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_intra_inter(xd); - return cm->fc.intra_inter_prob[pred_context]; -} - -unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd); - - -static INLINE -vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm, - const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd); - return cm->fc.comp_inter_prob[pred_context]; -} - -unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, - const MACROBLOCKD *xd); +int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm, + const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, const MACROBLOCKD *xd) { @@ -91,26 +82,24 @@ static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm, return cm->fc.comp_ref_prob[pred_context]; } -unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); +int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_single_ref_p1(xd); - return cm->fc.single_ref_prob[pred_context][0]; + return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0]; } -unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); +int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd); static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, const MACROBLOCKD *xd) { - const int pred_context = vp9_get_pred_context_single_ref_p2(xd); - return cm->fc.single_ref_prob[pred_context][1]; + return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } -unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); +int vp9_get_tx_size_context(const MACROBLOCKD *xd); -static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, - const struct tx_probs *tx_probs) { +static INLINE const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, + const struct tx_probs *tx_probs) { switch (max_tx_size) { case TX_8X8: return tx_probs->p8x8[ctx]; @@ -119,19 +108,19 @@ static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, case TX_32X32: return tx_probs->p32x32[ctx]; 
default: - assert(!"Invalid max_tx_size."); + assert(0 && "Invalid max_tx_size."); return NULL; } } -static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { - const int ctx = vp9_get_pred_context_tx_size(xd); - return get_tx_probs(max_tx_size, ctx, tx_probs); +static INLINE const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, + const MACROBLOCKD *xd, + const struct tx_probs *tx_probs) { + return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs); } -static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, - struct tx_counts *tx_counts) { +static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, + struct tx_counts *tx_counts) { switch (max_tx_size) { case TX_8X8: return tx_counts->p8x8[ctx]; @@ -140,9 +129,13 @@ static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, case TX_32X32: return tx_counts->p32x32[ctx]; default: - assert(!"Invalid max_tx_size."); + assert(0 && "Invalid max_tx_size."); return NULL; } } +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/media/libvpx/vp9/common/vp9_prob.c b/media/libvpx/vp9/common/vp9_prob.c new file mode 100644 index 000000000000..a1befc63e88d --- /dev/null +++ b/media/libvpx/vp9/common/vp9_prob.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_prob.h" + +const uint8_t vp9_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +static unsigned int tree_merge_probs_impl(unsigned int i, + const vp9_tree_index *tree, + const vp9_prob *pre_probs, + const unsigned int *counts, + unsigned int count_sat, + unsigned int max_update, + vp9_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = (l <= 0) + ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, + count_sat, max_update, probs); + const int r = tree[i + 1]; + const unsigned int right_count = (r <= 0) + ? 
counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, + count_sat, max_update, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, + count_sat, max_update); + return left_count + right_count; +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat, + max_update_factor, probs); +} diff --git a/media/libvpx/vp9/common/vp9_prob.h b/media/libvpx/vp9/common/vp9_prob.h new file mode 100644 index 000000000000..fa0e36da4726 --- /dev/null +++ b/media/libvpx/vp9/common/vp9_prob.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_PROB_H_ +#define VP9_COMMON_VP9_PROB_H_ + +#include "./vpx_config.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint8_t vp9_prob; + +#define MAX_PROB 255 + +#define vp9_prob_half ((vp9_prob) 128) + +typedef int8_t vp9_tree_index; + +#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2) + +#define vp9_complement(x) (255 - x) + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp9_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vp9_tree_index vp9_tree[]; + +static INLINE vp9_prob clip_prob(int p) { + return (p > 255) ? 255 : (p < 1) ? 1 : p; +} + +static INLINE vp9_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); +} + +static INLINE vp9_prob get_binary_prob(int n0, int n1) { + return get_prob(n0, n0 + n1); +} + +/* This function assumes prob1 and prob2 are already within [1,255] range. 
*/ +static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const vp9_prob prob = get_binary_prob(ct[0], ct[1]); + const unsigned int count = MIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs, + const unsigned int *counts, unsigned int count_sat, + unsigned int max_update_factor, vp9_prob *probs); + + +DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_COMMON_VP9_PROB_H_ diff --git a/media/libvpx/vp9/common/vp9_quant_common.c b/media/libvpx/vp9/common/vp9_quant_common.c index 6dbdb421623f..3332e58e6a7c 100644 --- a/media/libvpx/vp9/common/vp9_quant_common.c +++ b/media/libvpx/vp9/common/vp9_quant_common.c @@ -12,7 +12,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" -#if 1 static const int16_t dc_qlookup[QINDEX_RANGE] = { 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, @@ -83,44 +82,6 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = { 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; -void vp9_init_quant_tables(void) { } -#else -static int16_t dc_qlookup[QINDEX_RANGE]; -static int16_t ac_qlookup[QINDEX_RANGE]; - -#define ACDC_MIN 8 - -// TODO(dkovalev) move to common and reuse -static double poly3(double a, double b, double c, double d, double x) { - return a*x*x*x + b*x*x + c*x + d; -} - -void vp9_init_quant_tables() { - int i, val = 4; - - // A "real" q of 1.0 forces lossless mode. - // In practice non lossless Q's between 1.0 and 2.0 (represented here by - // integer values from 5-7 give poor rd results (lower psnr and often - // larger size than the lossless encode. To block out those "not very useful" - // values we increment the ac and dc q lookup values by 4 after position 0. - ac_qlookup[0] = val; - dc_qlookup[0] = val; - val += 4; - - for (i = 1; i < QINDEX_RANGE; i++) { - const int ac_val = val; - - val = (int)(val * 1.01975); - if (val == ac_val) - ++val; - - ac_qlookup[i] = (int16_t)ac_val; - dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9, - 0.5, ac_val)); - } -} -#endif - int16_t vp9_dc_quant(int qindex, int delta) { return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; } @@ -130,12 +91,13 @@ int16_t vp9_ac_quant(int qindex, int delta) { } -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex) { +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { const int data = vp9_get_segdata(seg, segment_id, SEG_LVL_ALT_Q); - return seg->abs_delta == SEGMENT_ABSDATA ? - data : // Abs value - clamp(base_qindex + data, 0, MAXQ); // Delta value + const int seg_qindex = seg->abs_delta == SEGMENT_ABSDATA ? 
+ data : base_qindex + data; + return clamp(seg_qindex, 0, MAXQ); } else { return base_qindex; } diff --git a/media/libvpx/vp9/common/vp9_quant_common.h b/media/libvpx/vp9/common/vp9_quant_common.h index 83f2fb655308..d1545d93c245 100644 --- a/media/libvpx/vp9/common/vp9_quant_common.h +++ b/media/libvpx/vp9/common/vp9_quant_common.h @@ -13,16 +13,23 @@ #include "vp9/common/vp9_blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MINQ 0 #define MAXQ 255 #define QINDEX_RANGE (MAXQ - MINQ + 1) #define QINDEX_BITS 8 -void vp9_init_quant_tables(); - int16_t vp9_dc_quant(int qindex, int delta); int16_t vp9_ac_quant(int qindex, int delta); -int vp9_get_qindex(struct segmentation *seg, int segment_id, int base_qindex); +int vp9_get_qindex(const struct segmentation *seg, int segment_id, + int base_qindex); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/media/libvpx/vp9/common/vp9_reconinter.c b/media/libvpx/vp9/common/vp9_reconinter.c index 7cc66c8b4fdc..86ae64839130 100644 --- a/media/libvpx/vp9/common/vp9_reconinter.c +++ b/media/libvpx/vp9/common/vp9_reconinter.c @@ -20,59 +20,81 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATION_TYPE mcomp_filter_type, - VP9_COMMON *cm) { - if (xd->mi_8x8 && xd->mi_8x8[0]) { - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; +static void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; - set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME, - mbmi->ref_frame[1] - LAST_FRAME, - cm->active_ref_scale); - } else { - set_scale_factors(xd, -1, -1, cm->active_ref_scale); - } + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; - xd->subpix.filter_x = xd->subpix.filter_y = - vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ? - EIGHTTAP : mcomp_filter_type); + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; - assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); + if (left > b_w) + left = b_w; + + if (x + b_w > w) + right = x + b_w - w; + + if (right > b_w) + right = b_w; + + copy = b_w - left - right; + + if (left) + memset(dst, ref_row[0], left); + + if (copy) + memcpy(dst + left, ref_row + x + left, copy); + + if (right) + memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) + ref_row += src_stride; + } while (--b_h); } static void inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const MV32 *mv, - const struct scale_factors *scale, + const int subpel_x, + const int subpel_y, + const struct scale_factors *sf, int w, int h, int ref, - const struct subpix_fn_table *subpix, + const InterpKernel *kernel, int xs, int ys) { - const int subpel_x = mv->col & SUBPEL_MASK; - const int subpel_y = mv->row & SUBPEL_MASK; - - src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS); - scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref]( + sf->predict[subpel_x != 0][subpel_y != 0][ref]( src, src_stride, dst, dst_stride, - subpix->filter_x[subpel_x], xs, - subpix->filter_y[subpel_y], ys, - w, h); + kernel[subpel_x], xs, kernel[subpel_y], ys, w, h); } void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, - const struct scale_factors *scale, + const struct scale_factors *sf, int w, int h, int ref, - const struct subpix_fn_table *subpix, - enum mv_precision precision) { + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y) { const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2, is_q4 ? src_mv->col : src_mv->col * 2 }; - const struct scale_factors_common *sfc = scale->sfc; - const MV32 mv = sfc->scale_mv(&mv_q4, scale); + MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf); + const int subpel_x = mv.col & SUBPEL_MASK; + const int subpel_y = mv.row & SUBPEL_MASK; - inter_predictor(src, src_stride, dst, dst_stride, &mv, scale, - w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4); + src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS); + + inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, + sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4); } static INLINE int round_mv_comp_q4(int value) { @@ -91,6 +113,18 @@ static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { return res; } +static INLINE int round_mv_comp_q2(int value) { + return (value < 0 ? 
value - 1 : value + 1) / 2;
+}
+
+static MV mi_mv_pred_q2(const MODE_INFO *mi, int idx, int block0, int block1) {
+  MV res = { round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.row +
+                              mi->bmi[block1].as_mv[idx].as_mv.row),
+             round_mv_comp_q2(mi->bmi[block0].as_mv[idx].as_mv.col +
+                              mi->bmi[block1].as_mv[idx].as_mv.col) };
+  return res;
+}
+
 // TODO(jkoleszar): yet another mv clamping function :-(
 MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
                              int bw, int bh, int ss_x, int ss_y) {
@@ -117,41 +151,46 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
   return clamped_mv;
 }
 
+static MV average_split_mvs(const struct macroblockd_plane *pd,
+                            const MODE_INFO *mi, int ref, int block) {
+  const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
+  MV res = {0, 0};
+  switch (ss_idx) {
+    case 0:
+      res = mi->bmi[block].as_mv[ref].as_mv;
+      break;
+    case 1:
+      res = mi_mv_pred_q2(mi, ref, block, block + 2);
+      break;
+    case 2:
+      res = mi_mv_pred_q2(mi, ref, block, block + 1);
+      break;
+    case 3:
+      res = mi_mv_pred_q4(mi, ref);
+      break;
+    default:
+      assert(ss_idx >= 0 && ss_idx <= 3);
+  }
+  return res;
+}
 
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
 static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                   BLOCK_SIZE bsize, int pred_w, int pred_h,
+                                   int bw, int bh,
+                                   int x, int y, int w, int h,
                                    int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int bwl = b_width_log2(bsize) - pd->subsampling_x;
-  const int bw = 4 << bwl;
-  const int bh = plane_block_height(bsize, pd);
-  const int x = 4 * (block & ((1 << bwl) - 1));
-  const int y = 4 * (block >> bwl);
-  const MODE_INFO *mi = xd->mi_8x8[0];
+  const MODE_INFO *mi = xd->mi[0];
   const int is_compound = has_second_ref(&mi->mbmi);
+  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
   int ref;
 
-  assert(x < bw);
-  assert(y < bh);
-  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
-  assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
-
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    struct scale_factors *const scale = &xd->scale_factor[ref];
+    const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
     struct buf_2d *const pre_buf = &pd->pre[ref];
     struct buf_2d *const dst_buf = &pd->dst;
    uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-
-    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
-    // same MV (the average of the 4 luma MVs) but we could do something
-    // smarter for non-4:2:0. Just punt for now, pending the changes to get
-    // rid of SPLITMV mode entirely.
     const MV mv = mi->mbmi.sb_type < BLOCK_8X8
-               ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
-                             : mi_mv_pred_q4(mi, ref))
+               ? 
average_split_mvs(pd, mi, ref, block) : mi->mbmi.mv[ref].as_mv; // TODO(jkoleszar): This clamping is done in the incorrect place for the @@ -165,25 +204,26 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, uint8_t *pre; MV32 scaled_mv; - int xs, ys; + int xs, ys, subpel_x, subpel_y; - if (vp9_is_scaled(scale->sfc)) { - pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale); - scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x); - scaled_mv = scale->sfc->scale_mv(&mv_q4, scale); - xs = scale->sfc->x_step_q4; - ys = scale->sfc->y_step_q4; + if (vp9_is_scaled(sf)) { + pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf); + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; } else { pre = pre_buf->buf + (y * pre_buf->stride + x); scaled_mv.row = mv_q4.row; scaled_mv.col = mv_q4.col; xs = ys = 16; } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride + + (scaled_mv.col >> SUBPEL_BITS); inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, - &scaled_mv, scale, - 4 << pred_w, 4 << pred_h, ref, - &xd->subpix, xs, ys); + subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys); } } @@ -191,20 +231,26 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int plane_from, int plane_to) { int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; for (plane = plane_from; plane <= plane_to; ++plane) { - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; - if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) { + if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) { int i = 0, x, y; assert(bsize == BLOCK_8X8); - for (y = 0; y < 1 << bhl; ++y) - for (x = 0; x < 1 << bwl; ++x) - build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); } else { - build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y); + build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); } } } @@ -224,22 +270,200 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, MAX_MB_PLANE - 1); } -// TODO(dkovalev: find better place for this function) -void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { - const int ref = cm->active_ref_idx[i]; - struct scale_factors *const sf = &cm->active_ref_scale[i]; - struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i]; - if (ref >= NUM_YV12_BUFFERS) { - vp9_zero(*sf); - vp9_zero(*sfc); - } else { - YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; - vp9_setup_scale_factors_for_frame(sf, sfc, - fb->y_crop_width, fb->y_crop_height, - cm->width, cm->height); +// TODO(jingning): This function serves as a placeholder for decoder prediction +// using on demand border extension. It should be moved to /decoder/ directory. 
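/* Editor's sketch ahead of the decoder predictor below; illustration only,
 * the helper name and sample values are not part of the upstream patch.
 * The function that follows avoids pre-padding whole reference frames: it
 * splits each q4 motion vector into an integer pixel offset plus a subpel
 * phase, then extends the border on demand only when the interpolation
 * taps would read outside the frame. */
static void split_mv_q4_example(int mv_q4, int *fullpel, int *phase) {
  *fullpel = mv_q4 >> 4;  /* SUBPEL_BITS; e.g. 35 >> 4 == 2 whole pixels   */
  *phase = mv_q4 & 15;    /* SUBPEL_MASK; e.g. 35 & 15 == 3 -> kernel[3]   */
}
/* A nonzero phase makes the 8-tap kernel read 3 pixels before and 4 after
 * the block in each direction, which is why x0/y0 shrink by
 * VP9_INTERP_EXTEND - 1 and x1/y1 grow by VP9_INTERP_EXTEND before
 * build_mc_border() is considered. */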
+static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, + int bw, int bh, + int x, int y, int w, int h, + int mi_x, int mi_y) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const MODE_INFO *mi = xd->mi[0]; + const int is_compound = has_second_ref(&mi->mbmi); + const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); + int ref; - if (vp9_is_scaled(sfc)) - vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y); + for (ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + struct buf_2d *const pre_buf = &pd->pre[ref]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + const MV mv = mi->mbmi.sb_type < BLOCK_8X8 + ? average_split_mvs(pd, mi, ref, block) + : mi->mbmi.mv[ref].as_mv; + + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. + // mv_precision precision is MV_PRECISION_Q4. + const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, + pd->subsampling_x, + pd->subsampling_y); + + MV32 scaled_mv; + int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride, + subpel_x, subpel_y; + uint8_t *ref_frame, *buf_ptr; + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + + // Get reference frame pointer, width and height. + if (plane == 0) { + frame_width = ref_buf->y_crop_width; + frame_height = ref_buf->y_crop_height; + ref_frame = ref_buf->y_buffer; + } else { + frame_width = ref_buf->uv_crop_width; + frame_height = ref_buf->uv_crop_height; + ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer; + } + + if (vp9_is_scaled(sf)) { + // Co-ordinate of containing block to pixel precision. + int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); + int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = (x_start + x) << SUBPEL_BITS; + y0_16 = (y_start + y) << SUBPEL_BITS; + + // Co-ordinate of current block in reference frame + // to 1/16th pixel precision. + x0_16 = sf->scale_value_x(x0_16, sf); + y0_16 = sf->scale_value_y(y0_16, sf); + + // Map the top left corner of the block into the reference frame. + x0 = sf->scale_value_x(x_start + x, sf); + y0 = sf->scale_value_y(y_start + y, sf); + + // Scale the MV and incorporate the sub-pixel offset of the block + // in the reference frame. + scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); + xs = sf->x_step_q4; + ys = sf->y_step_q4; + } else { + // Co-ordinate of containing block to pixel precision. + x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // Co-ordinate of the block to 1/16th pixel precision. + x0_16 = x0 << SUBPEL_BITS; + y0_16 = y0 << SUBPEL_BITS; + + scaled_mv.row = mv_q4.row; + scaled_mv.col = mv_q4.col; + xs = ys = 16; + } + subpel_x = scaled_mv.col & SUBPEL_MASK; + subpel_y = scaled_mv.row & SUBPEL_MASK; + + // Calculate the top left corner of the best matching block in the reference frame. + x0 += scaled_mv.col >> SUBPEL_BITS; + y0 += scaled_mv.row >> SUBPEL_BITS; + x0_16 += scaled_mv.col; + y0_16 += scaled_mv.row; + + // Get reference block pointer. 
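/* Editor's illustration with hypothetical numbers (not part of the patch):
 * with pre_buf->stride == 640, y0 == 10 and x0 == -2, the pointer formed
 * below, ref_frame + 10 * 640 + (-2), lands two bytes before the start of
 * the row.  The extension branch further down catches exactly this case
 * (x0 < 0) and re-bases buf_ptr into xd->mc_buf, which build_mc_border()
 * first fills with edge-replicated pixels. */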
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; + + // Do border extension if there is motion or the + // width/height is not a multiple of 8 pixels. + if (scaled_mv.col || scaled_mv.row || + (frame_width & 0x7) || (frame_height & 0x7)) { + // Get reference block bottom right coordinate. + int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; + int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; + int x_pad = 0, y_pad = 0; + + if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { + x0 -= VP9_INTERP_EXTEND - 1; + x1 += VP9_INTERP_EXTEND; + x_pad = 1; + } + + if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) { + y0 -= VP9_INTERP_EXTEND - 1; + y1 += VP9_INTERP_EXTEND; + y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || + y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { + uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; + // Extend the border. + build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0 + 1, + x0, y0, x1 - x0 + 1, y1 - y0 + 1, frame_width, + frame_height); + buf_stride = x1 - x0 + 1; + buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; + } + } + + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, + subpel_y, sf, w, h, ref, kernel, xs, ys); } } +void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int plane; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, + &xd->plane[plane]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) { + int i = 0, x, y; + assert(bsize == BLOCK_8X8); + for (y = 0; y < num_4x4_h; ++y) + for (x = 0; x < num_4x4_w; ++x) + dec_build_inter_predictors(xd, plane, i++, bw, bh, + 4 * x, 4 * y, 4, 4, mi_x, mi_y); + } else { + dec_build_inter_predictors(xd, plane, 0, bw, bh, + 0, 0, bw, bh, mi_x, mi_y); + } + } +} + +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &planes[i]; + setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, + pd->subsampling_x, pd->subsampling_y); + } +} + +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *sf) { + if (src != NULL) { + int i; + uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col, + sf, pd->subsampling_x, pd->subsampling_y); + } + } +} diff --git a/media/libvpx/vp9/common/vp9_reconinter.h b/media/libvpx/vp9/common/vp9_reconinter.h index 2c8a6e4d92dd..58c596ee87f0 100644 --- 
a/media/libvpx/vp9/common/vp9_reconinter.h +++ b/media/libvpx/vp9/common/vp9_reconinter.h @@ -14,7 +14,10 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_onyxc_int.h" -struct subpix_fn_table; +#ifdef __cplusplus +extern "C" { +#endif + void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); @@ -24,80 +27,46 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATION_TYPE filter, - VP9_COMMON *cm); +void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, + BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *mv_q3, - const struct scale_factors *scale, + const struct scale_factors *sf, int w, int h, int do_avg, - const struct subpix_fn_table *subpix, - enum mv_precision precision); + const InterpKernel *kernel, + enum mv_precision precision, + int x, int y); -static int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *scale) { - const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) : - x_offset; - const int y = scale ? scale->sfc->scale_value_y(y_offset, scale->sfc) : - y_offset; +static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *sf) { + const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; + const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; return y * stride + x; } -static void setup_pred_plane(struct buf_2d *dst, - uint8_t *src, int stride, - int mi_row, int mi_col, - const struct scale_factors *scale, - int subsampling_x, int subsampling_y) { +static INLINE void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { const int x = (MI_SIZE * mi_col) >> subsampling_x; const int y = (MI_SIZE * mi_row) >> subsampling_y; dst->buf = src + scaled_buffer_offset(x, y, stride, scale); dst->stride = stride; } -// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col -static void setup_dst_planes(MACROBLOCKD *xd, - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - int i; +void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col); - for (i = 0; i < MAX_MB_PLANE; ++i) { - struct macroblockd_plane *pd = &xd->plane[i]; - setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, - pd->subsampling_x, pd->subsampling_y); - } -} +void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf); -static void setup_pre_planes(MACROBLOCKD *xd, int i, - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col, - const struct scale_factors *sf) { - if (src) { - int j; - uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - - for (j = 0; j < MAX_MB_PLANE; ++j) { - struct macroblockd_plane *pd = &xd->plane[j]; - setup_pred_plane(&pd->pre[i], buffers[j], 
strides[j],
-                          mi_row, mi_col, sf, pd->subsampling_x, pd->subsampling_y);
-    }
-  }
-}
-
-static void set_scale_factors(MACROBLOCKD *xd, int ref0, int ref1,
-                              struct scale_factors sf[MAX_REF_FRAMES]) {
-  xd->scale_factor[0] = sf[ref0 >= 0 ? ref0 : 0];
-  xd->scale_factor[1] = sf[ref1 >= 0 ? ref1 : 0];
-}
-
-void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/media/libvpx/vp9/common/vp9_reconintra.c b/media/libvpx/vp9/common/vp9_reconintra.c
index eb643b09089a..471929aea091 100644
--- a/media/libvpx/vp9/common/vp9_reconintra.c
+++ b/media/libvpx/vp9/common/vp9_reconintra.c
@@ -9,32 +9,29 @@
  */
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/vpx_once.h"
-
-#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
-    DCT_DCT,    // DC
-    ADST_DCT,   // V
-    DCT_ADST,   // H
-    DCT_DCT,    // D45
-    ADST_ADST,  // D135
-    ADST_DCT,   // D117
-    DCT_ADST,   // D153
-    DCT_ADST,   // D207
-    ADST_DCT,   // D63
-    ADST_ADST,  // TM
-    DCT_DCT,    // NEARESTMV
-    DCT_DCT,    // NEARMV
-    DCT_DCT,    // ZEROMV
-    DCT_DCT     // NEWMV
+const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
+    DCT_DCT,    // DC
+    ADST_DCT,   // V
+    DCT_ADST,   // H
+    DCT_DCT,    // D45
+    ADST_ADST,  // D135
+    ADST_DCT,   // D117
+    DCT_ADST,   // D153
+    DCT_ADST,   // D207
+    ADST_DCT,   // D63
+    ADST_ADST,  // TM
 };
 
+// This serves as a wrapper function, so that all the prediction functions
+// can be unified and accessed as a pointer array. Note that the boundary
+// above and left are not necessarily used all the time.
 #define intra_pred_sized(type, size) \
   void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \
                                                   ptrdiff_t stride, \
@@ -52,7 +49,7 @@ const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
 static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
-
+  (void) above;
   // first column
   for (r = 0; r < bs - 1; ++r)
     dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1);
@@ -81,6 +78,7 @@ intra_pred_allsizes(d207)
 static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   int r, c;
+  (void) left;
   for (r = 0; r < bs; ++r) {
     for (c = 0; c < bs; ++c)
      dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] +
@@ -96,6 +94,7 @@ intra_pred_allsizes(d63)
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
   int r, c;
+  (void) left;
   for (r = 0; r < bs; ++r) {
     for (c = 0; c < bs; ++c)
       dst[c] = r + c + 2 < bs * 2 ?  ROUND_POWER_OF_TWO(above[r + c] +
@@ -188,6 +187,7 @@ intra_pred_allsizes(d153)
 static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                const uint8_t *above, const uint8_t *left) {
   int r;
+  (void) left;
 
   for (r = 0; r < bs; r++) {
     vpx_memcpy(dst, above, bs);
@@ -199,6 +199,7 @@ intra_pred_allsizes(v)
 static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                const uint8_t *above, const uint8_t *left) {
   int r;
+  (void) above;
 
   for (r = 0; r < bs; r++) {
     vpx_memset(dst, left[r], bs);
@@ -223,6 +224,8 @@ intra_pred_allsizes(tm)
 static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                     const uint8_t *above, const uint8_t *left) {
   int r;
+  (void) above;
+  (void) left;
 
   for (r = 0; r < bs; r++) {
     vpx_memset(dst, 128, bs);
@@ -235,6 +238,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                      const uint8_t *above,
                                      const uint8_t *left) {
   int i, r, expected_dc, sum = 0;
+  (void) above;
 
   for (i = 0; i < bs; i++)
     sum += left[i];
@@ -250,6 +254,7 @@ intra_pred_allsizes(dc_left)
 static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                     const uint8_t *above, const uint8_t *left) {
   int i, r, expected_dc, sum = 0;
+  (void) left;
 
   for (i = 0; i < bs; i++)
     sum += above[i];
@@ -285,45 +290,49 @@ intra_pred_allsizes(dc)
 
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
 
-static intra_pred_fn pred[INTRA_MODES][4];
-static intra_pred_fn dc_pred[2][2][4];
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES];
+static intra_pred_fn dc_pred[2][2][TX_SIZES];
 
-static void init_intra_pred_fn_ptrs(void) {
-#define intra_pred_allsizes(l, type) \
-  l[0] = vp9_##type##_predictor_4x4; \
-  l[1] = vp9_##type##_predictor_8x8; \
-  l[2] = vp9_##type##_predictor_16x16; \
-  l[3] = vp9_##type##_predictor_32x32
+void vp9_init_intra_predictors() {
+#define INIT_ALL_SIZES(p, type) \
+  p[TX_4X4] = vp9_##type##_predictor_4x4; \
+  p[TX_8X8] = vp9_##type##_predictor_8x8; \
+  p[TX_16X16] = vp9_##type##_predictor_16x16; \
+  p[TX_32X32] = vp9_##type##_predictor_32x32
 
-  intra_pred_allsizes(pred[V_PRED], v);
-  intra_pred_allsizes(pred[H_PRED], h);
-  intra_pred_allsizes(pred[D207_PRED], d207);
-  intra_pred_allsizes(pred[D45_PRED], d45);
-  intra_pred_allsizes(pred[D63_PRED], d63);
-  intra_pred_allsizes(pred[D117_PRED], d117);
-  intra_pred_allsizes(pred[D135_PRED], d135);
-  intra_pred_allsizes(pred[D153_PRED], d153);
-  intra_pred_allsizes(pred[TM_PRED], tm);
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[D207_PRED], d207);
+  INIT_ALL_SIZES(pred[D45_PRED], d45);
+  INIT_ALL_SIZES(pred[D63_PRED], d63);
+  INIT_ALL_SIZES(pred[D117_PRED], d117);
+  INIT_ALL_SIZES(pred[D135_PRED], d135);
+  INIT_ALL_SIZES(pred[D153_PRED], d153);
+  INIT_ALL_SIZES(pred[TM_PRED], tm);
 
-  intra_pred_allsizes(dc_pred[0][0], dc_128);
-  intra_pred_allsizes(dc_pred[0][1], dc_top);
-  intra_pred_allsizes(dc_pred[1][0], dc_left);
-  intra_pred_allsizes(dc_pred[1][1], dc);
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
 
-#undef intra_pred_allsizes
+#undef INIT_ALL_SIZES
 }
 
-static void build_intra_predictors(const uint8_t *ref, int ref_stride,
-                                   uint8_t *dst, int dst_stride,
-                                   MB_PREDICTION_MODE mode, TX_SIZE tx_size,
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+                                   int ref_stride, uint8_t *dst, int dst_stride,
+                                   PREDICTION_MODE mode, TX_SIZE tx_size,
                                    int up_available, int left_available,
-                                   int right_available) {
+                                   int right_available, int x, int y,
+                                   int plane) {
   int i;
   DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
   DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
   uint8_t *above_row = above_data + 16;
   const uint8_t *const_above_row = above_row;
   const int bs = 4 << tx_size;
+  int frame_width, frame_height;
+  int x0, y0;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
 
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
@@ -332,28 +341,90 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride,
   // 129  G   H  ..  S   T   T   T   T   T
   // ..
 
-  once(init_intra_pred_fn_ptrs);
+  // Get current frame pointer, width and height.
+  if (plane == 0) {
+    frame_width = xd->cur_buf->y_width;
+    frame_height = xd->cur_buf->y_height;
+  } else {
+    frame_width = xd->cur_buf->uv_width;
+    frame_height = xd->cur_buf->uv_height;
+  }
+
+  // Get block position in current frame.
+  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+  vpx_memset(left_col, 129, 64);
 
   // left
   if (left_available) {
-    for (i = 0; i < bs; i++)
-      left_col[i] = ref[i * ref_stride - 1];
-  } else {
-    vpx_memset(left_col, 129, bs);
+    if (xd->mb_to_bottom_edge < 0) {
+      /* slower path if the block needs border extension */
+      if (y0 + bs <= frame_height) {
+        for (i = 0; i < bs; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+      } else {
+        const int extend_bottom = frame_height - y0;
+        for (i = 0; i < extend_bottom; ++i)
+          left_col[i] = ref[i * ref_stride - 1];
+        for (; i < bs; ++i)
+          left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+      }
+    } else {
+      /* faster path if the block does not need extension */
+      for (i = 0; i < bs; ++i)
+        left_col[i] = ref[i * ref_stride - 1];
+    }
   }
 
+  // TODO(hkuang) do not extend 2*bs pixels for all modes.
   // above
   if (up_available) {
     const uint8_t *above_ref = ref - ref_stride;
-    if (bs == 4 && right_available && left_available) {
-      const_above_row = above_ref;
-    } else {
-      vpx_memcpy(above_row, above_ref, bs);
-      if (bs == 4 && right_available)
-        vpx_memcpy(above_row + bs, above_ref + bs, bs);
-      else
-        vpx_memset(above_row + bs, above_row[bs - 1], bs);
+    if (xd->mb_to_right_edge < 0) {
+      /* slower path if the block needs border extension */
+      if (x0 + 2 * bs <= frame_width) {
+        if (right_available && bs == 4) {
+          vpx_memcpy(above_row, above_ref, 2 * bs);
+        } else {
+          vpx_memcpy(above_row, above_ref, bs);
+          vpx_memset(above_row + bs, above_row[bs - 1], bs);
+        }
+      } else if (x0 + bs <= frame_width) {
+        const int r = frame_width - x0;
+        if (right_available && bs == 4) {
+          vpx_memcpy(above_row, above_ref, r);
+          vpx_memset(above_row + r, above_row[r - 1],
+                     x0 + 2 * bs - frame_width);
+        } else {
+          vpx_memcpy(above_row, above_ref, bs);
+          vpx_memset(above_row + bs, above_row[bs - 1], bs);
+        }
+      } else if (x0 <= frame_width) {
+        const int r = frame_width - x0;
+        if (right_available && bs == 4) {
+          vpx_memcpy(above_row, above_ref, r);
+          vpx_memset(above_row + r, above_row[r - 1],
+                     x0 + 2 * bs - frame_width);
+        } else {
+          vpx_memcpy(above_row, above_ref, r);
+          vpx_memset(above_row + r, above_row[r - 1],
+                     x0 + 2 * bs - frame_width);
+        }
+      }
+      above_row[-1] = left_available ? above_ref[-1] : 129;
+    } else {
+      /* faster path if the block does not need extension */
+      if (bs == 4 && right_available && left_available) {
+        const_above_row = above_ref;
+      } else {
+        vpx_memcpy(above_row, above_ref, bs);
+        if (bs == 4 && right_available)
+          vpx_memcpy(above_row + bs, above_ref + bs, bs);
+        else
+          vpx_memset(above_row + bs, above_row[bs - 1], bs);
+        above_row[-1] = left_available ? above_ref[-1] : 129;
+      }
     }
   } else {
     vpx_memset(above_row, 127, bs * 2);
@@ -370,16 +441,19 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride,
 }
 
 void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
-                             TX_SIZE tx_size, int mode,
-                             const uint8_t *ref, int ref_stride,
-                             uint8_t *dst, int dst_stride) {
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             const uint8_t *ref, int ref_stride,
+                             uint8_t *dst, int dst_stride,
+                             int aoff, int loff, int plane) {
   const int bwl = bwl_in - tx_size;
   const int wmask = (1 << bwl) - 1;
   const int have_top = (block_idx >> bwl) || xd->up_available;
   const int have_left = (block_idx & wmask) || xd->left_available;
   const int have_right = ((block_idx & wmask) != wmask);
+  const int x = aoff * 4;
+  const int y = loff * 4;
 
   assert(bwl >= 0);
-  build_intra_predictors(ref, ref_stride, dst, dst_stride, mode, tx_size,
-                         have_top, have_left, have_right);
+  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+                         have_top, have_left, have_right, x, y, plane);
 }
diff --git a/media/libvpx/vp9/common/vp9_reconintra.h b/media/libvpx/vp9/common/vp9_reconintra.h
index 6e3f55c4d83e..845f3bcaac79 100644
--- a/media/libvpx/vp9/common/vp9_reconintra.h
+++ b/media/libvpx/vp9/common/vp9_reconintra.h
@@ -14,8 +14,19 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_init_intra_predictors();
+
 void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
-                             TX_SIZE tx_size, int mode,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
                              const uint8_t *ref, int ref_stride,
-                             uint8_t *dst, int dst_stride);
+                             uint8_t *dst, int dst_stride,
+                             int aoff, int loff, int plane);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/media/libvpx/vp9/common/vp9_sadmxn.h b/media/libvpx/vp9/common/vp9_sadmxn.h
deleted file mode 100644
index b2dfd63f9b41..000000000000
--- a/media/libvpx/vp9/common/vp9_sadmxn.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SADMXN_H_
-#define VP9_COMMON_VP9_SADMXN_H_
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr,
-                                      int src_stride,
-                                      const uint8_t *ref_ptr,
-                                      int ref_stride,
-                                      int m,
-                                      int n) {
-  int r, c;
-  unsigned int sad = 0;
-
-  for (r = 0; r < n; r++) {
-    for (c = 0; c < m; c++) {
-      sad += abs(src_ptr[c] - ref_ptr[c]);
-    }
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sad;
-}
-
-#endif  // VP9_COMMON_VP9_SADMXN_H_
diff --git a/media/libvpx/vp9/common/vp9_scale.c b/media/libvpx/vp9/common/vp9_scale.c
index 3f0994f80a2e..2f58323aa450 100644
--- a/media/libvpx/vp9/common/vp9_scale.c
+++ b/media/libvpx/vp9/common/vp9_scale.c
@@ -12,47 +12,19 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_scale.h"
 
-static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) {
-  return val * sfc->x_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_x(int val, const struct scale_factors *sf) {
+  return (int)((int64_t)val * sf->x_scale_fp >> REF_SCALE_SHIFT);
 }
 
-static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) {
-  return val * sfc->y_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_y(int val, const struct scale_factors *sf) {
+  return (int)((int64_t)val * sf->y_scale_fp >> REF_SCALE_SHIFT);
 }
 
-static int unscaled_value(int val, const struct scale_factors_common *sfc) {
-  (void) sfc;
+static int unscaled_value(int val, const struct scale_factors *sf) {
+  (void) sf;
   return val;
 }
 
-static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) {
-  const MV32 res = {
-    scaled_y(mv->row, scale->sfc) + scale->y_offset_q4,
-    scaled_x(mv->col, scale->sfc) + scale->x_offset_q4
-  };
-  return res;
-}
-
-static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) {
-  const MV32 res = {
-    mv->row,
-    mv->col
-  };
-  return res;
-}
-
-static void set_offsets_with_scaling(struct scale_factors *scale,
-                                     int row, int col) {
-  scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
-  scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
-}
-
-static void set_offsets_without_scaling(struct scale_factors *scale,
-                                        int row, int col) {
-  scale->x_offset_q4 = 0;
-  scale->y_offset_q4 = 0;
-}
-
 static int get_fixed_point_scale_factor(int other_size, int this_size) {
   // Calculate scaling factor once for each reference frame
   // and use fixed point scaling factors in decoding and encoding routines.
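// A worked example of the Q14 fixed-point scale factor computed by the
// function above; the function name here is illustrative only, and
// REF_SCALE_SHIFT == 14 is assumed as defined in vp9_scale.h.
static int scale_factor_example(void) {
  // A 1920-wide reference scaled to a 960-wide current frame:
  //   (1920 << 14) / 960 == 32768, i.e. 2.0 in Q14 fixed point.
  const int x_scale_fp = (1920 << 14) / 960;
  // Applying it the way scaled_x() does: position 100 in the current
  // frame maps to (100 * 32768) >> 14 == 200 in the reference frame.
  return (int)(((int64_t)100 * x_scale_fp) >> 14);  // 200
}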
@@ -61,39 +33,36 @@ static int get_fixed_point_scale_factor(int other_size, int this_size) {
   return (other_size << REF_SCALE_SHIFT) / this_size;
 }
 
-static int check_scale_factors(int other_w, int other_h,
-                               int this_w, int this_h) {
-  return 2 * this_w >= other_w &&
-         2 * this_h >= other_h &&
-         this_w <= 16 * other_w &&
-         this_h <= 16 * other_h;
+MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf) {
+  const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf) & SUBPEL_MASK;
+  const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf) & SUBPEL_MASK;
+  const MV32 res = {
+    scaled_y(mv->row, sf) + y_off_q4,
+    scaled_x(mv->col, sf) + x_off_q4
+  };
+  return res;
 }
 
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       struct scale_factors_common *scale_comm,
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h) {
-  if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
-    scale_comm->x_scale_fp = REF_INVALID_SCALE;
-    scale_comm->y_scale_fp = REF_INVALID_SCALE;
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+    sf->x_scale_fp = REF_INVALID_SCALE;
+    sf->y_scale_fp = REF_INVALID_SCALE;
     return;
   }
 
-  scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
-  scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
-  scale_comm->x_step_q4 = scaled_x(16, scale_comm);
-  scale_comm->y_step_q4 = scaled_y(16, scale_comm);
+  sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+  sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+  sf->x_step_q4 = scaled_x(16, sf);
+  sf->y_step_q4 = scaled_y(16, sf);
 
-  if (vp9_is_scaled(scale_comm)) {
-    scale_comm->scale_value_x = scaled_x;
-    scale_comm->scale_value_y = scaled_y;
-    scale_comm->set_scaled_offsets = set_offsets_with_scaling;
-    scale_comm->scale_mv = scaled_mv;
+  if (vp9_is_scaled(sf)) {
+    sf->scale_value_x = scaled_x;
+    sf->scale_value_y = scaled_y;
   } else {
-    scale_comm->scale_value_x = unscaled_value;
-    scale_comm->scale_value_y = unscaled_value;
-    scale_comm->set_scaled_offsets = set_offsets_without_scaling;
-    scale_comm->scale_mv = unscaled_mv;
+    sf->scale_value_x = unscaled_value;
+    sf->scale_value_y = unscaled_value;
   }
 
   // TODO(agrange): Investigate the best choice of functions to use here
@@ -102,48 +71,44 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
-  if (scale_comm->x_step_q4 == 16) {
-    if (scale_comm->y_step_q4 == 16) {
+  if (sf->x_step_q4 == 16) {
+    if (sf->y_step_q4 == 16) {
       // No scaling in either direction.
-      scale_comm->predict[0][0][0] = vp9_convolve_copy;
-      scale_comm->predict[0][0][1] = vp9_convolve_avg;
-      scale_comm->predict[0][1][0] = vp9_convolve8_vert;
-      scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
-      scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
+      sf->predict[0][0][0] = vp9_convolve_copy;
+      sf->predict[0][0][1] = vp9_convolve_avg;
+      sf->predict[0][1][0] = vp9_convolve8_vert;
+      sf->predict[0][1][1] = vp9_convolve8_avg_vert;
+      sf->predict[1][0][0] = vp9_convolve8_horiz;
+      sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
    } else {
       // No scaling in x direction. Must always scale in the y direction.
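      // The predict[2][2][2] table filled in below is indexed as
      // [has_subpel_x][has_subpel_y][do_avg] (see "horiz, vert, avg" in
      // vp9_scale.h). For instance, once this function has configured a
      // struct scale_factors *sf, a caller wanting vertical-only filtering
      // without averaging would reach the vp9_convolve8_vert pointer via
      // (src, dst, w and h being the caller's buffers and block size):
      //
      //   sf->predict[0][1][0](src, src_stride, dst, dst_stride,
      //                        filter_x, sf->x_step_q4,
      //                        filter_y, sf->y_step_q4, w, h);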
-      scale_comm->predict[0][0][0] = vp9_convolve8_vert;
-      scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert;
-      scale_comm->predict[0][1][0] = vp9_convolve8_vert;
-      scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
-      scale_comm->predict[1][0][0] = vp9_convolve8;
-      scale_comm->predict[1][0][1] = vp9_convolve8_avg;
+      sf->predict[0][0][0] = vp9_convolve8_vert;
+      sf->predict[0][0][1] = vp9_convolve8_avg_vert;
+      sf->predict[0][1][0] = vp9_convolve8_vert;
+      sf->predict[0][1][1] = vp9_convolve8_avg_vert;
+      sf->predict[1][0][0] = vp9_convolve8;
+      sf->predict[1][0][1] = vp9_convolve8_avg;
     }
   } else {
-    if (scale_comm->y_step_q4 == 16) {
+    if (sf->y_step_q4 == 16) {
       // No scaling in the y direction. Must always scale in the x direction.
-      scale_comm->predict[0][0][0] = vp9_convolve8_horiz;
-      scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz;
-      scale_comm->predict[0][1][0] = vp9_convolve8;
-      scale_comm->predict[0][1][1] = vp9_convolve8_avg;
-      scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
-      scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
+      sf->predict[0][0][0] = vp9_convolve8_horiz;
+      sf->predict[0][0][1] = vp9_convolve8_avg_horiz;
+      sf->predict[0][1][0] = vp9_convolve8;
+      sf->predict[0][1][1] = vp9_convolve8_avg;
+      sf->predict[1][0][0] = vp9_convolve8_horiz;
+      sf->predict[1][0][1] = vp9_convolve8_avg_horiz;
     } else {
       // Must always scale in both directions.
-      scale_comm->predict[0][0][0] = vp9_convolve8;
-      scale_comm->predict[0][0][1] = vp9_convolve8_avg;
-      scale_comm->predict[0][1][0] = vp9_convolve8;
-      scale_comm->predict[0][1][1] = vp9_convolve8_avg;
-      scale_comm->predict[1][0][0] = vp9_convolve8;
-      scale_comm->predict[1][0][1] = vp9_convolve8_avg;
+      sf->predict[0][0][0] = vp9_convolve8;
+      sf->predict[0][0][1] = vp9_convolve8_avg;
+      sf->predict[0][1][0] = vp9_convolve8;
+      sf->predict[0][1][1] = vp9_convolve8_avg;
+      sf->predict[1][0][0] = vp9_convolve8;
+      sf->predict[1][0][1] = vp9_convolve8_avg;
     }
   }
   // 2D subpel motion always gets filtered in both directions
-  scale_comm->predict[1][1][0] = vp9_convolve8;
-  scale_comm->predict[1][1][1] = vp9_convolve8_avg;
-
-  scale->sfc = scale_comm;
-  scale->x_offset_q4 = 0;  // calculated per block
-  scale->y_offset_q4 = 0;  // calculated per block
+  sf->predict[1][1][0] = vp9_convolve8;
+  sf->predict[1][1][1] = vp9_convolve8_avg;
 }
diff --git a/media/libvpx/vp9/common/vp9_scale.h b/media/libvpx/vp9/common/vp9_scale.h
index 1437fcd9c7ee..ad6f5d702aff 100644
--- a/media/libvpx/vp9/common/vp9_scale.h
+++ b/media/libvpx/vp9/common/vp9_scale.h
@@ -14,44 +14,52 @@
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_convolve.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define REF_SCALE_SHIFT 14
 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
 #define REF_INVALID_SCALE -1
 
-struct scale_factors;
-struct scale_factors_common {
+struct scale_factors {
   int x_scale_fp;   // horizontal fixed point scale factor
   int y_scale_fp;   // vertical fixed point scale factor
   int x_step_q4;
   int y_step_q4;
 
-  int (*scale_value_x)(int val, const struct scale_factors_common *sfc);
-  int (*scale_value_y)(int val, const struct scale_factors_common *sfc);
-  void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
-  MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale);
+  int (*scale_value_x)(int val, const struct scale_factors *sf);
+  int (*scale_value_y)(int val, const struct scale_factors *sf);
 
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
 };
 
-struct scale_factors {
-  int x_offset_q4;
-  int y_offset_q4;
-  const struct scale_factors_common *sfc;
-};
+MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
 
-void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
-                                       struct scale_factors_common *scale_comm,
+void vp9_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h);
 
-static int vp9_is_valid_scale(const struct scale_factors_common *sfc) {
-  return sfc->x_scale_fp != REF_INVALID_SCALE &&
-         sfc->y_scale_fp != REF_INVALID_SCALE;
+static INLINE int vp9_is_valid_scale(const struct scale_factors *sf) {
+  return sf->x_scale_fp != REF_INVALID_SCALE &&
+         sf->y_scale_fp != REF_INVALID_SCALE;
 }
 
-static int vp9_is_scaled(const struct scale_factors_common *sfc) {
-  return sfc->x_scale_fp != REF_NO_SCALE ||
-         sfc->y_scale_fp != REF_NO_SCALE;
+static INLINE int vp9_is_scaled(const struct scale_factors *sf) {
+  return vp9_is_valid_scale(sf) &&
+         (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
 }
 
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                       int this_width, int this_height) {
+  return 2 * this_width >= ref_width &&
+         2 * this_height >= ref_height &&
+         this_width <= 16 * ref_width &&
+         this_height <= 16 * ref_height;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SCALE_H_
diff --git a/media/libvpx/vp9/common/vp9_scan.c b/media/libvpx/vp9/common/vp9_scan.c
index f17da91104bc..1ec5a0cf36c6 100644
--- a/media/libvpx/vp9/common/vp9_scan.c
+++ b/media/libvpx/vp9/common/vp9_scan.c
@@ -12,28 +12,28 @@
 
 #include "vp9/common/vp9_scan.h"
 
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
   0,  4,  1,  5,
   8,  2, 12,  9,
   3,  6, 13, 10,
   7, 14, 11, 15,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
   0,  4,  8,  1,
  12,  5,  9,  2,
  13,  6, 10,  3,
  7, 14, 11, 15,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = {
   0,  1,  4,  2,
   5,  3,  6,  8,
   9,  7, 12, 10,
  13, 11, 14, 15,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
   0,  8,  1, 16,  9,  2, 17, 24,
  10,  3, 18, 25, 32, 11,  4, 26,
  33, 19, 40, 12, 34, 27,  5, 41,
@@ -44,7 +44,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
  46, 39, 61, 54, 47, 62, 55, 63,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
   0,  8, 16,  1, 24,  9, 32, 17,
   2, 40, 25, 10, 33, 18, 48,  3,
  26, 41, 11, 56, 19, 34,  4, 49,
@@ -55,7 +55,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
  31, 61, 39, 54, 47, 62, 55, 63,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = {
   0,  1,  2,  8,  9,  3, 16, 10,
   4, 17, 11, 24,  5, 18, 25, 12,
  19, 26, 32,  6, 13, 20, 33, 27,
@@ -66,7 +66,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
  60, 39, 61, 47, 54, 55, 62, 63,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
   0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34,
   49, 19, 65, 80, 50, 4, 35, 66, 20, 81, 96, 51,
   5, 36, 82, 97, 67, 112, 21, 52, 98, 37, 83, 113,
   6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
@@ -87,7 +87,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
   255,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
   0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2,
   65, 112, 18, 81, 34, 128, 50, 97, 3, 66, 144, 19,
   113, 35, 82, 160, 98, 51, 129, 4, 67, 176, 20, 114,
   145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
@@ -108,7 +108,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
   255,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = {
   0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19,
   6, 34, 48, 20, 49, 7, 35, 21, 50, 64, 8, 36,
   65, 22, 51, 37, 80, 9, 66, 52, 23, 38, 81, 67,
   10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
@@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
   255,
 };
 
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
   0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35,
   98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100,
@@ -233,38 +233,68 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
 // in {top, left, topleft, topright, bottomleft} order
 // for each position in raster scan order.
 // -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
-                vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_32x32[1024]);
 
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+const scan_order vp9_default_scan_orders[TX_SIZES] = {
+  {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors},
+  {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors},
+  {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors},
+  {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+};
+
+const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = {
+  {  // TX_4X4
+    {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors},
+    {col_scan_4x4, vp9_col_iscan_4x4, col_scan_4x4_neighbors},
+    {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}
+  }, {  // TX_8X8
+    {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8, vp9_row_iscan_8x8, row_scan_8x8_neighbors},
+    {col_scan_8x8, vp9_col_iscan_8x8, col_scan_8x8_neighbors},
+    {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}
+  }, {  // TX_16X16
+    {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors},
+    {row_scan_16x16, vp9_row_iscan_16x16, row_scan_16x16_neighbors},
+    {col_scan_16x16, vp9_col_iscan_16x16, col_scan_16x16_neighbors},
+    {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}
+  }, {  // TX_32X32
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+  }
+};
 
 static int find_in_scan(const int16_t *scan, int l, int idx) {
   int n, l2 = l * l;
@@ -276,9 +306,9 @@ static int find_in_scan(const int16_t *scan, int l, int idx) {
   assert(0);
   return -1;
 }
-static void init_scan_neighbors(const int16_t *scan,
-                                int16_t *iscan,
-                                int l, int16_t *neighbors) {
+
+static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, int l,
+                                int16_t *neighbors) {
   int l2 = l * l;
   int n, i, j;
 
@@ -302,15 +332,15 @@ static void init_scan_neighbors(const int16_t *scan,
       // use the combination of the two as a context.
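      // Worked example: for a 4x4 scan (l == 4) at raster position
      // (i, j) == (1, 2), the neighbor above is a == (1 - 1) * 4 + 2 == 2
      // and the neighbor to the left is b == 1 * 4 + 1 == 5, both raster
      // indices into token_cache; get_coef_context() in vp9_scan.h later
      // combines them as (1 + token_cache[a] + token_cache[b]) >> 1.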
        int a = (i - 1) * l + j;
        int b = i * l + j - 1;
-        if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
-            scan == vp9_col_scan_16x16) {
+        if (scan == col_scan_4x4 || scan == col_scan_8x8 ||
+            scan == col_scan_16x16) {
          // in the col/row scan cases (as well as left/top edge cases), we set
          // both contexts to the same value, so we can branchlessly do a+b+1>>1
          // which automatically becomes a if a == b
          neighbors[MAX_NEIGHBORS * n + 0] =
          neighbors[MAX_NEIGHBORS * n + 1] = a;
-        } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
-                   scan == vp9_row_scan_16x16) {
+        } else if (scan == row_scan_4x4 || scan == row_scan_8x8 ||
+                   scan == row_scan_16x16) {
          neighbors[MAX_NEIGHBORS * n + 0] =
          neighbors[MAX_NEIGHBORS * n + 1] = b;
        } else {
@@ -334,24 +364,24 @@ static void init_scan_neighbors(const int16_t *scan,
 }
 
 void vp9_init_neighbors() {
-  init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
-                      vp9_default_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
-                      vp9_row_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
-                      vp9_col_scan_4x4_neighbors);
-  init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
-                      vp9_default_scan_8x8_neighbors);
-  init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
-                      vp9_row_scan_8x8_neighbors);
-  init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
-                      vp9_col_scan_8x8_neighbors);
-  init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
-                      vp9_default_scan_16x16_neighbors);
-  init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
-                      vp9_row_scan_16x16_neighbors);
-  init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
-                      vp9_col_scan_16x16_neighbors);
-  init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
-                      vp9_default_scan_32x32_neighbors);
+  init_scan_neighbors(default_scan_4x4, vp9_default_iscan_4x4, 4,
+                      default_scan_4x4_neighbors);
+  init_scan_neighbors(row_scan_4x4, vp9_row_iscan_4x4, 4,
+                      row_scan_4x4_neighbors);
+  init_scan_neighbors(col_scan_4x4, vp9_col_iscan_4x4, 4,
+                      col_scan_4x4_neighbors);
+  init_scan_neighbors(default_scan_8x8, vp9_default_iscan_8x8, 8,
+                      default_scan_8x8_neighbors);
+  init_scan_neighbors(row_scan_8x8, vp9_row_iscan_8x8, 8,
+                      row_scan_8x8_neighbors);
+  init_scan_neighbors(col_scan_8x8, vp9_col_iscan_8x8, 8,
+                      col_scan_8x8_neighbors);
+  init_scan_neighbors(default_scan_16x16, vp9_default_iscan_16x16, 16,
+                      default_scan_16x16_neighbors);
+  init_scan_neighbors(row_scan_16x16, vp9_row_iscan_16x16, 16,
+                      row_scan_16x16_neighbors);
+  init_scan_neighbors(col_scan_16x16, vp9_col_iscan_16x16, 16,
+                      col_scan_16x16_neighbors);
+  init_scan_neighbors(default_scan_32x32, vp9_default_iscan_32x32, 32,
+                      default_scan_32x32_neighbors);
 }
diff --git a/media/libvpx/vp9/common/vp9_scan.h b/media/libvpx/vp9/common/vp9_scan.h
index 14a1a7eb02fd..9613b675c2e7 100644
--- a/media/libvpx/vp9/common/vp9_scan.h
+++ b/media/libvpx/vp9/common/vp9_scan.h
@@ -15,180 +15,24 @@
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #define MAX_NEIGHBORS 2
 
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
-                       vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-
 void vp9_init_neighbors();
 
-static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_scan_4x4;
-    case DCT_ADST:
-      return vp9_col_scan_4x4;
-    default:
-      return vp9_default_scan_4x4;
-  }
-}
+typedef struct {
+  const int16_t *scan;
+  const int16_t *iscan;
+  const int16_t *neighbors;
+} scan_order;
 
-static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
-                                   const int16_t **scan, const int16_t **nb) {
-  switch (tx_type) {
-    case ADST_DCT:
-      *scan = vp9_row_scan_4x4;
-      *nb = vp9_row_scan_4x4_neighbors;
-      break;
-    case DCT_ADST:
-      *scan = vp9_col_scan_4x4;
-      *nb = vp9_col_scan_4x4_neighbors;
-      break;
-    default:
-      *scan = vp9_default_scan_4x4;
-      *nb = vp9_default_scan_4x4_neighbors;
-      break;
-  }
-}
-
-static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_iscan_4x4;
-    case DCT_ADST:
-      return vp9_col_iscan_4x4;
-    default:
-      return vp9_default_iscan_4x4;
-  }
-}
-
-static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_scan_8x8;
-    case DCT_ADST:
-      return vp9_col_scan_8x8;
-    default:
-      return vp9_default_scan_8x8;
-  }
-}
-
-static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
-                                   const int16_t **scan, const int16_t **nb) {
-  switch (tx_type) {
-    case ADST_DCT:
-      *scan = vp9_row_scan_8x8;
-      *nb = vp9_row_scan_8x8_neighbors;
-      break;
-    case DCT_ADST:
-      *scan = vp9_col_scan_8x8;
-      *nb = vp9_col_scan_8x8_neighbors;
-      break;
-    default:
-      *scan = vp9_default_scan_8x8;
-      *nb = vp9_default_scan_8x8_neighbors;
-      break;
-  }
-}
-
-static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_iscan_8x8;
-    case DCT_ADST:
-      return vp9_col_iscan_8x8;
-    default:
-      return vp9_default_iscan_8x8;
-  }
-}
-
-static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_scan_16x16;
-    case DCT_ADST:
-      return vp9_col_scan_16x16;
-    default:
-      return vp9_default_scan_16x16;
-  }
-}
-
-static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
-                                     const int16_t **scan, const int16_t **nb) {
-  switch (tx_type) {
-    case ADST_DCT:
-      *scan = vp9_row_scan_16x16;
-      *nb = vp9_row_scan_16x16_neighbors;
-      break;
-    case DCT_ADST:
-      *scan = vp9_col_scan_16x16;
-      *nb = vp9_col_scan_16x16_neighbors;
-      break;
-    default:
-      *scan = vp9_default_scan_16x16;
-      *nb = vp9_default_scan_16x16_neighbors;
-      break;
-  }
-}
-
-static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
-  switch (tx_type) {
-    case ADST_DCT:
-      return vp9_row_iscan_16x16;
-    case DCT_ADST:
-      return vp9_col_iscan_16x16;
-    default:
-      return vp9_default_iscan_16x16;
-  }
-}
+extern const scan_order vp9_default_scan_orders[TX_SIZES];
+extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
 
 static INLINE int get_coef_context(const int16_t *neighbors,
                                    const uint8_t *token_cache, int c) {
@@ -196,4 +40,8 @@ static INLINE int get_coef_context(const int16_t *neighbors,
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SCAN_H_
diff --git a/media/libvpx/vp9/common/vp9_seg_common.c b/media/libvpx/vp9/common/vp9_seg_common.c
index ef30404b4547..910200ecc9c5 100644
--- a/media/libvpx/vp9/common/vp9_seg_common.c
+++ b/media/libvpx/vp9/common/vp9_seg_common.c
@@ -41,11 +41,6 @@ void vp9_enable_segfeature(struct segmentation *seg, int segment_id,
   seg->feature_mask[segment_id] |= 1 << feature_id;
 }
 
-void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
-                            SEG_LVL_FEATURES feature_id) {
-  seg->feature_mask[segment_id] &= ~(1 << feature_id);
-}
-
 int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
   return seg_feature_data_max[feature_id];
 }
@@ -54,11 +49,6 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
   return seg_feature_data_signed[feature_id];
 }
 
-void vp9_clear_segdata(struct segmentation *seg, int segment_id,
-                       SEG_LVL_FEATURES feature_id) {
-  seg->feature_data[segment_id][feature_id] = 0;
-}
-
 void vp9_set_segdata(struct segmentation *seg, int segment_id,
                      SEG_LVL_FEATURES feature_id, int seg_data) {
   assert(seg_data <= seg_feature_data_max[feature_id]);
diff --git a/media/libvpx/vp9/common/vp9_seg_common.h b/media/libvpx/vp9/common/vp9_seg_common.h
index eb38c06be5ad..ff2d66a36582 100644
--- a/media/libvpx/vp9/common/vp9_seg_common.h
+++ b/media/libvpx/vp9/common/vp9_seg_common.h
@@ -11,7 +11,11 @@
 #ifndef VP9_COMMON_VP9_SEG_COMMON_H_
 #define VP9_COMMON_VP9_SEG_COMMON_H_
 
-#include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #define SEGMENT_DELTADATA 0
 #define SEGMENT_ABSDATA 1
@@ -55,18 +59,10 @@ void vp9_enable_segfeature(struct segmentation *seg,
                            int segment_id,
                            SEG_LVL_FEATURES feature_id);
 
-void vp9_disable_segfeature(struct segmentation *seg,
-                            int segment_id,
-                            SEG_LVL_FEATURES feature_id);
-
 int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
 
 int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
 
-void vp9_clear_segdata(struct segmentation *seg,
-                       int segment_id,
-                       SEG_LVL_FEATURES feature_id);
-
 void vp9_set_segdata(struct segmentation *seg,
                      int segment_id,
                      SEG_LVL_FEATURES feature_id,
@@ -78,5 +74,9 @@ int vp9_get_segdata(const struct segmentation *seg,
 
 extern const vp9_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)];
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_
diff --git a/media/libvpx/vp9/common/vp9_systemdependent.h b/media/libvpx/vp9/common/vp9_systemdependent.h
index 254a431a3005..e9711582303f 100644
--- a/media/libvpx/vp9/common/vp9_systemdependent.h
+++ b/media/libvpx/vp9/common/vp9_systemdependent.h
@@ -12,8 +12,16 @@
 #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 
 #ifdef _MSC_VER
-#include <math.h>
-#define snprintf _snprintf
+# include <math.h>  // the ceil() definition must precede intrin.h
+# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
+#  include <intrin.h>
+#  define USE_MSC_INTRIN
+# endif
+# define snprintf _snprintf
+#endif
+
+#ifdef __cplusplus
+extern "C" {
 #endif
 
 #include "./vpx_config.h"
@@ -26,7 +34,7 @@ void vpx_reset_mmx_state(void);
 
 #if defined(_MSC_VER) && _MSC_VER < 1800
 // round is not defined in MSVC before VS2013.
-static int round(double x) {
+static INLINE int round(double x) {
   if (x < 0)
     return (int)ceil(x - 0.5);
   else
@@ -34,7 +42,42 @@
 }
 #endif
 
-struct VP9Common;
-void vp9_machine_specific_config(struct VP9Common *cm);
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_msb(unsigned int n) {
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(USE_MSC_INTRIN)
+#pragma intrinsic(_BitScanReverse)
+
+static INLINE int get_msb(unsigned int n) {
+  unsigned long first_set_bit;
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#undef USE_MSC_INTRIN
+#else
+// Returns (int)floor(log2(n)). n must be > 0.
+static INLINE int get_msb(unsigned int n) {
+  int log = 0;
+  unsigned int value = n;
+  int i;
+
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const unsigned int x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
 
 #endif  // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
diff --git a/media/libvpx/vp9/common/vp9_thread.c b/media/libvpx/vp9/common/vp9_thread.c
new file mode 100644
index 000000000000..1c6aec032fe1
--- /dev/null
+++ b/media/libvpx/vp9/common/vp9_thread.c
@@ -0,0 +1,184 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+//  http://git.chromium.org/webm/libwebp.git
+//  100644 blob 264210ba2807e4da47eb5d18c04cf869d89b9784  src/utils/thread.c
+
+#include <assert.h>
+#include <string.h>   // for memset()
+#include "./vp9_thread.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_MULTITHREAD
+
+struct VP9WorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(VP9Worker *const worker);  // Forward declaration.
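// A minimal usage sketch of this worker API (the hook and job names are
// hypothetical; the call sequence follows the VP9WorkerInterface contract
// documented in vp9_thread.h):
//
//   VP9Worker worker;
//   const VP9WorkerInterface *const iface = vp9_get_worker_interface();
//   iface->init(&worker);
//   worker.hook = my_hook;    // int my_hook(void *data1, void *data2);
//   worker.data1 = &job_in;
//   worker.data2 = &job_out;
//   if (iface->reset(&worker)) {  // spawns the thread on first use
//     iface->launch(&worker);     // my_hook runs on the worker thread
//     iface->sync(&worker);       // blocks until the hook has returned
//   }
//   iface->end(&worker);          // joins and tears the thread down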
+
+static THREADFN thread_loop(void *ptr) {
+  VP9Worker *const worker = (VP9Worker*)ptr;
+  int done = 0;
+  while (!done) {
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    while (worker->status_ == OK) {   // wait in idling mode
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    if (worker->status_ == WORK) {
+      execute(worker);
+      worker->status_ = OK;
+    } else if (worker->status_ == NOT_OK) {   // finish the worker
+      done = 1;
+    }
+    // signal to the main thread that we're done (for sync())
+    pthread_cond_signal(&worker->impl_->condition_);
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+  }
+  return THREAD_RETURN(NULL);    // Thread is finished
+}
+
+// main thread state control
+static void change_state(VP9Worker *const worker,
+                         VP9WorkerStatus new_status) {
+  // No-op when attempting to change state on a thread that didn't come up.
+  // Checking status_ without acquiring the lock first would result in a data
+  // race.
+  if (worker->impl_ == NULL) return;
+
+  pthread_mutex_lock(&worker->impl_->mutex_);
+  if (worker->status_ >= OK) {
+    // wait for the worker to finish
+    while (worker->status_ != OK) {
+      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+    }
+    // assign new status and release the working thread if needed
+    if (new_status != OK) {
+      worker->status_ = new_status;
+      pthread_cond_signal(&worker->impl_->condition_);
+    }
+  }
+  pthread_mutex_unlock(&worker->impl_->mutex_);
+}
+
+#endif  // CONFIG_MULTITHREAD
+
+//------------------------------------------------------------------------------
+
+static void init(VP9Worker *const worker) {
+  memset(worker, 0, sizeof(*worker));
+  worker->status_ = NOT_OK;
+}
+
+static int sync(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, OK);
+#endif
+  assert(worker->status_ <= OK);
+  return !worker->had_error;
+}
+
+static int reset(VP9Worker *const worker) {
+  int ok = 1;
+  worker->had_error = 0;
+  if (worker->status_ < OK) {
+#if CONFIG_MULTITHREAD
+    worker->impl_ = (VP9WorkerImpl*)vpx_calloc(1, sizeof(*worker->impl_));
+    if (worker->impl_ == NULL) {
+      return 0;
+    }
+    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+      goto Error;
+    }
+    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      goto Error;
+    }
+    pthread_mutex_lock(&worker->impl_->mutex_);
+    ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+    if (ok) worker->status_ = OK;
+    pthread_mutex_unlock(&worker->impl_->mutex_);
+    if (!ok) {
+      pthread_mutex_destroy(&worker->impl_->mutex_);
+      pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+      vpx_free(worker->impl_);
+      worker->impl_ = NULL;
+      return 0;
+    }
+#else
+    worker->status_ = OK;
+#endif
+  } else if (worker->status_ > OK) {
+    ok = sync(worker);
+  }
+  assert(!ok || (worker->status_ == OK));
+  return ok;
+}
+
+static void execute(VP9Worker *const worker) {
+  if (worker->hook != NULL) {
+    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+  }
+}
+
+static void launch(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, WORK);
+#else
+  execute(worker);
+#endif
+}
+
+static void end(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+  if (worker->impl_ != NULL) {
+    change_state(worker, NOT_OK);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
+    vpx_free(worker->impl_);
+    worker->impl_ = NULL;
+  }
+#else
+  worker->status_ = NOT_OK;
+  assert(worker->impl_ == NULL);
+#endif
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static VP9WorkerInterface g_worker_interface = {
+  init, reset, sync, launch, execute, end
+};
+
+int vp9_set_worker_interface(const VP9WorkerInterface* const winterface) {
+  if (winterface == NULL ||
+      winterface->init == NULL || winterface->reset == NULL ||
+      winterface->sync == NULL || winterface->launch == NULL ||
+      winterface->execute == NULL || winterface->end == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const VP9WorkerInterface *vp9_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/media/libvpx/vp9/common/vp9_thread.h b/media/libvpx/vp9/common/vp9_thread.h
new file mode 100644
index 000000000000..864579c03c3c
--- /dev/null
+++ b/media/libvpx/vp9/common/vp9_thread.h
@@ -0,0 +1,219 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Original source:
+//  http://git.chromium.org/webm/libwebp.git
+//  100644 blob 7bd451b124ae3b81596abfbcc823e3cb129d3a38  src/utils/thread.h
+
+#ifndef VP9_DECODER_VP9_THREAD_H_
+#define VP9_DECODER_VP9_THREAD_H_
+
+#include "./vpx_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32)
+#include <errno.h>    // NOLINT
+#include <process.h>  // NOLINT
+#include <windows.h>  // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef struct {
+  HANDLE waiting_sem_;
+  HANDLE received_sem_;
+  HANDLE signal_event_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t* const thread, const void* attr,
+                                 unsigned int (__stdcall *start)(void*),
+                                 void* arg) {
+  (void)attr;
+  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
+                                      0,      /* unsigned stack_size */
+                                      start,
+                                      arg,
+                                      0,      /* unsigned initflag */
+                                      NULL);  /* unsigned *thrdaddr */
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void* mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;
+  ok &= (CloseHandle(condition->waiting_sem_) != 0);
+  ok &= (CloseHandle(condition->received_sem_) != 0);
+  ok &= (CloseHandle(condition->signal_event_) != 0);
+  return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void* cond_attr) {
+  (void)cond_attr;
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+  if (condition->waiting_sem_ == NULL ||
+      condition->received_sem_ == NULL ||
+      condition->signal_event_ == NULL) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;
+  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok = SetEvent(condition->signal_event_);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+           WAIT_OBJECT_0);
+  }
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;
+  // note that there is a consumer available so the signal isn't dropped in
+  // pthread_cond_signal
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
+    return 1;
+  // now unlock the mutex so pthread_cond_signal may be issued
+  pthread_mutex_unlock(mutex);
+  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+        WAIT_OBJECT_0);
+  ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
+  pthread_mutex_lock(mutex);
+  return !ok;
+}
+#else  // _WIN32
+#include <pthread.h>  // NOLINT
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
+
+#endif  // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+  NOT_OK = 0,   // object is unusable
+  OK,           // ready to work
+  WORK          // busy finishing the current task
+} VP9WorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2), and should return false in case of error.
+typedef int (*VP9WorkerHook)(void*, void*);
+
+// Platform-dependent implementation details for the worker.
+typedef struct VP9WorkerImpl VP9WorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
+typedef struct {
+  VP9WorkerImpl *impl_;
+  VP9WorkerStatus status_;
+  VP9WorkerHook hook;     // hook to call
+  void *data1;            // first argument passed to 'hook'
+  void *data2;            // second argument passed to 'hook'
+  int had_error;          // return value of the last call to 'hook'
+} VP9Worker;
+
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+  // Must be called first, before any other method.
+  void (*init)(VP9Worker *const worker);
+  // Must be called to initialize the object and spawn the thread. Re-entrant.
+  // Will potentially launch the thread. Returns false in case of error.
+  int (*reset)(VP9Worker *const worker);
+  // Makes sure the previous work is finished. Returns true if worker->had_error
+  // was not set and no error condition was triggered by the working thread.
+  int (*sync)(VP9Worker *const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but not be changed afterward until the next call to Sync().
+  void (*launch)(VP9Worker *const worker);
+  // This function is similar to launch() except that it calls the
+  // hook directly instead of using a thread. Convenient to bypass the thread
+  // mechanism while still using the VP9Worker structs. sync() must
+  // still be called afterward (for error reporting).
+  void (*execute)(VP9Worker *const worker);
+  // Kill the thread and terminate the object. To use the object again, one
+  // must call reset() again.
+  void (*end)(VP9Worker *const worker);
+} VP9WorkerInterface;
+
+// Install a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Return false in case of invalid pointer or methods.
+int vp9_set_worker_interface(const VP9WorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const VP9WorkerInterface *vp9_get_worker_interface(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // VP9_DECODER_VP9_THREAD_H_
diff --git a/media/libvpx/vp9/common/vp9_tile_common.c b/media/libvpx/vp9/common/vp9_tile_common.c
index 78909dd9bead..8c4a30353c16 100644
--- a/media/libvpx/vp9/common/vp9_tile_common.c
+++ b/media/libvpx/vp9/common/vp9_tile_common.c
@@ -21,13 +21,21 @@ static int get_tile_offset(int idx, int mis, int log2) {
   return MIN(offset, mis);
 }
 
-void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) {
   tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
   tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+}
+
+void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) {
   tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
   tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
 }
 
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+  vp9_tile_set_row(tile, cm, row);
+  vp9_tile_set_col(tile, cm, col);
+}
+
 void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
                          int *max_log2_tile_cols) {
   const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
diff --git a/media/libvpx/vp9/common/vp9_tile_common.h b/media/libvpx/vp9/common/vp9_tile_common.h
index a09876e4b0c9..ae58805de1c8 100644
--- a/media/libvpx/vp9/common/vp9_tile_common.h
+++ b/media/libvpx/vp9/common/vp9_tile_common.h
@@ -11,6 +11,10 @@
 #ifndef VP9_COMMON_VP9_TILE_COMMON_H_
 #define VP9_COMMON_VP9_TILE_COMMON_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP9Common;
 
 typedef struct TileInfo {
@@ -23,7 +27,14 @@ typedef struct TileInfo {
 void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
                    int row, int col);
 
+void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row);
+void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col);
+
 void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
                          int *max_log2_tile_cols);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_COMMON_VP9_TILE_COMMON_H_
diff --git a/media/libvpx/vp9/common/vp9_treecoder.c b/media/libvpx/vp9/common/vp9_treecoder.c
deleted file mode 100644
index e2a5b9faaff4..000000000000
--- a/media/libvpx/vp9/common/vp9_treecoder.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "vp9/common/vp9_treecoder.h"
-
-static void tree2tok(struct vp9_token *const p, vp9_tree t,
-                     int i, int v, int l) {
-  v += v;
-  ++l;
-
-  do {
-    const vp9_tree_index j = t[i++];
-
-    if (j <= 0) {
-      p[-j].value = v;
-      p[-j].len = l;
-    } else {
-      tree2tok(p, t, j, v, l);
-    }
-  } while (++v & 1);
-}
-
-void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) {
-  tree2tok(p, t, 0, 0, 0);
-}
-
-static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
-                                         unsigned int branch_ct[][2],
-                                         const unsigned int num_events[]) {
-  unsigned int left, right;
-
-  if (tree[i] <= 0)
-    left = num_events[-tree[i]];
-  else
-    left = convert_distribution(tree[i], tree, branch_ct, num_events);
-
-  if (tree[i + 1] <= 0)
-    right = num_events[-tree[i + 1]];
-  else
-    right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
-
-  branch_ct[i >> 1][0] = left;
-  branch_ct[i >> 1][1] = right;
-  return left + right;
-}
-
-void vp9_tree_probs_from_distribution(vp9_tree tree,
-                                      unsigned int branch_ct[/* n-1 */][2],
-                                      const unsigned int num_events[/* n */]) {
-  convert_distribution(0, tree, branch_ct, num_events);
-}
-
-
diff --git a/media/libvpx/vp9/common/vp9_treecoder.h b/media/libvpx/vp9/common/vp9_treecoder.h
deleted file mode 100644
index a79b1564a1c4..000000000000
--- a/media/libvpx/vp9/common/vp9_treecoder.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_TREECODER_H_
-#define VP9_COMMON_VP9_TREECODER_H_
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_common.h"
-
-typedef uint8_t vp9_prob;
-
-#define vp9_prob_half ((vp9_prob) 128)
-
-typedef int8_t vp9_tree_index;
-
-#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
-
-#define vp9_complement(x) (255 - x)
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0.
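   As a hypothetical illustration (not a tree from this codebase): the
   3-leaf array { 0, 2, -1, -2 } encodes a root whose first branch ends
   at token 0 (value = -Index == 0) and whose second branch continues at
   index 2, where one more bit selects token 1 or token 2.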
*/ - -typedef const vp9_tree_index vp9_tree[]; - -struct vp9_token { - int value; - int len; -}; - -/* Construct encoding array from tree. */ - -void vp9_tokens_from_tree(struct vp9_token*, vp9_tree); - -/* Convert array of token occurrence counts into a table of probabilities - for the associated binary encoding tree. Also writes count of branches - taken for each node on the tree; this facilitiates decisions as to - probability updates. */ - -void vp9_tree_probs_from_distribution(vp9_tree tree, - unsigned int branch_ct[ /* n - 1 */ ][2], - const unsigned int num_events[ /* n */ ]); - - -static INLINE vp9_prob clip_prob(int p) { - return (p > 255) ? 255u : (p < 1) ? 1u : p; -} - -// int64 is not needed for normal frame level calculations. -// However when outputing entropy stats accumulated over many frames -// or even clips we can overflow int math. -#ifdef ENTROPY_STATS -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); -} -#else -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den); -} -#endif - -static INLINE vp9_prob get_binary_prob(int n0, int n1) { - return get_prob(n0, n0 + n1); -} - -/* this function assumes prob1 and prob2 are already within [1,255] range */ -static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { - return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); -} - -static INLINE vp9_prob merge_probs(vp9_prob pre_prob, - const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - const vp9_prob prob = get_binary_prob(ct[0], ct[1]); - const unsigned int count = MIN(ct[0] + ct[1], count_sat); - const unsigned int factor = max_update_factor * count / count_sat; - return weighted_prob(pre_prob, prob, factor); -} - -static unsigned int tree_merge_probs_impl(unsigned int i, - const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, - vp9_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = (l <= 0) - ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const int r = tree[i + 1]; - const unsigned int right_count = (r <= 0) - ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, - count_sat, max_update_factor, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct, - count_sat, max_update_factor); - return left_count + right_count; -} - -static void tree_merge_probs(const vp9_tree_index *tree, - const vp9_prob *pre_probs, - const unsigned int *counts, - unsigned int count_sat, - unsigned int max_update_factor, vp9_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, - count_sat, max_update_factor, probs); -} - - -#endif // VP9_COMMON_VP9_TREECODER_H_ diff --git a/media/libvpx/vp9/common/x86/vp9_asm_stubs.c b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c index 106e6d426f7d..1b4904c3936b 100644 --- a/media/libvpx/vp9/common/x86/vp9_asm_stubs.c +++ b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -13,45 +13,205 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_ports/mem.h" -/////////////////////////////////////////////////////////////////////////// -// the mmx function that does the bilinear filtering and var calculation // -// int one pass // -/////////////////////////////////////////////////////////////////////////// -DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, - { 120, 120, 120, 120, 8, 8, 8, 8 }, - { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 104, 104, 104, 104, 24, 24, 24, 24 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, - { 88, 88, 88, 88, 40, 40, 40, 40 }, - { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 72, 72, 72, 72, 56, 56, 56, 56 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, - { 56, 56, 56, 56, 72, 72, 72, 72 }, - { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 40, 40, 40, 40, 88, 88, 88, 88 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, - { 24, 24, 24, 24, 104, 104, 104, 104 }, - { 16, 16, 16, 16, 112, 112, 112, 112 }, - { 8, 8, 8, 8, 120, 120, 120, 120 } -}; typedef void filter8_1dfunction ( const unsigned char *src_ptr, - const unsigned int src_pitch, + const ptrdiff_t src_pitch, unsigned char *output_ptr, - unsigned int out_pitch, + ptrdiff_t out_pitch, unsigned int output_height, const short *filter ); +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, \ + src_stride, \ + dst, \ + dst_stride, \ + h, \ + filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h); \ + } \ +} + +#define FUN_CONV_2D(avg, opt) \ +void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || 
filter_y[3] == 128) { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } else { \ + vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ + } \ +} +#if HAVE_AVX2 +filter8_1dfunction vp9_filter_block1d16_v8_avx2; +filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 +#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 +#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 +#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3 +#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3 +#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3 +#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3 +// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif #if HAVE_SSSE3 +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; 
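// Editorial sketch, not part of the patch: FUN_CONV_1D above generates the
// width dispatch that the hand-written SSSE3/SSE2 wrappers deleted further
// down used to spell out. A minimal scalar model of that dispatch, with
// hypothetical names (example_conv_dispatch and f16/f8/f4 do not exist in
// libvpx):
static void example_conv_dispatch(const unsigned char *src,
                                  ptrdiff_t src_stride, unsigned char *dst,
                                  ptrdiff_t dst_stride, const short *filter,
                                  int step_q4, int w, int h,
                                  filter8_1dfunction *f16,
                                  filter8_1dfunction *f8,
                                  filter8_1dfunction *f4) {
  // Only unscaled filtering (step_q4 == 16) with a real kernel (tap 3 not
  // the pure-copy value 128) takes the SIMD path. The real macro further
  // selects 8-tap (_h8/_v8) or 2-tap (_h2/_v2) kernels depending on whether
  // the outer taps are nonzero; that split is omitted here.
  if (step_q4 == 16 && filter[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      f16(src, src_stride, dst, dst_stride, h, filter);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      f8(src, src_stride, dst, dst_stride, h, filter);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      f4(src, src_stride, dst, dst_stride, h, filter);
  }
  // Leftover columns (w not a multiple of 4) and scaled filtering fall back
  // to the vp9_convolve8_*_c reference functions in the generated code.
}
// FUN_CONV_2D chains two such passes: a horizontal pass into a 64-column
// intermediate buffer of h + 7 rows (the vertical 8-tap kernel reads 3 rows
// above and 4 rows below each output row), then a vertical pass from that
// buffer into dst; the 2-tap case only needs h + 1 intermediate rows.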
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; @@ -59,201 +219,57 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. */ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3; -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); 
- } -} +// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); -void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } else { - 
vp9_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} - -void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } else { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_ , ssse3); #endif #if HAVE_SSE2 @@ -270,199 +286,54 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; -void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. 
*/ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +filter8_1dfunction vp9_filter_block1d16_v2_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; -void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} +// void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); -void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - 
vp9_filter_block1d16_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_avg_sse2(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } else { - vp9_convolve8_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} - -void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); - - assert(w <= 64); - assert(h <= 64); - if (x_step_q4 == 16 && y_step_q4 == 16) { - vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h + 7); - vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } else { - vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, h); - } -} +// void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_ , sse2); #endif diff --git a/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm index dd522c698dcc..b26383708f7d 100644 --- 
a/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm +++ b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm @@ -133,10 +133,14 @@ INIT_MMX sse movh m3, [srcq+r5q] lea srcq, [srcq+src_strideq*4] %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] - pavgb m2, [dstq+dst_strideq*2] - pavgb m3, [dstq+r6q] + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavgb m0, m4 + pavgb m1, m5 + pavgb m2, m6 + pavgb m3, m7 %endif movh [dstq ], m0 movh [dstq+dst_strideq ], m1 diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 2a33844884e9..b60f8a06de98 100644 --- a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -8,12 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> -#include <emmintrin.h> // SSE2 -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_idct.h" +#include "vp9/common/x86/vp9_idct_intrin_sse2.h" #define RECON_AND_STORE4X4(dest, in_x) \ { \ @@ -174,15 +169,13 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { static INLINE void transpose_4x4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } -static void idct4_1d_sse2(__m128i *in) { +static void idct4_sse2(__m128i *in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -192,8 +185,8 @@ static void idct4_1d_sse2(__m128i *in) { transpose_4x4(in); // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); @@ -209,19 +202,16 @@ static void idct4_1d_sse2(__m128i *in) { v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[0] = _mm_packs_epi32(v[0], v[2]); - u[1] = _mm_packs_epi32(v[1], v[3]); - u[2] = _mm_unpackhi_epi64(u[0], u[0]); - u[3] = _mm_unpackhi_epi64(u[1], u[1]); + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); // stage 2 - in[0] = _mm_add_epi16(u[0], u[3]); - in[1] = _mm_add_epi16(u[1], u[2]); - in[2] = _mm_sub_epi16(u[1], u[2]); - in[3] = _mm_sub_epi16(u[0], u[3]); + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -static void iadst4_1d_sse2(__m128i *in) { +static void iadst4_sse2(__m128i *in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -232,13 +222,14 @@ static void iadst4_1d_sse2(__m128i *in) { __m128i u[8], v[8], in7; transpose_4x4(in); - in7 = _mm_add_epi16(in[0], in[3]); - in7 =
_mm_sub_epi16(in7, in[2]); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[1], kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 @@ -265,39 +256,35 @@ static void iadst4_1d_sse2(__m128i *in) { u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - in[2] = _mm_unpackhi_epi64(in[0], in[0]); - in[3] = _mm_unpackhi_epi64(in[1], in[1]); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); } void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { - __m128i in[4]; + __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadl_epi64((const __m128i *)input); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); + in[0]= _mm_loadu_si128((const __m128i *)(input)); + in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); switch (tx_type) { case 0: // DCT_DCT - idct4_1d_sse2(in); - idct4_1d_sse2(in); + idct4_sse2(in); + idct4_sse2(in); break; case 1: // ADST_DCT - idct4_1d_sse2(in); - iadst4_1d_sse2(in); + idct4_sse2(in); + iadst4_sse2(in); break; case 2: // DCT_ADST - iadst4_1d_sse2(in); - idct4_1d_sse2(in); + iadst4_sse2(in); + idct4_sse2(in); break; case 3: // ADST_ADST - iadst4_1d_sse2(in); - iadst4_1d_sse2(in); + iadst4_sse2(in); + iadst4_sse2(in); break; default: assert(0); @@ -307,18 +294,35 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, // Final round and shift in[0] = _mm_add_epi16(in[0], eight); in[1] = _mm_add_epi16(in[1], eight); - in[2] = _mm_add_epi16(in[2], eight); - in[3] = _mm_add_epi16(in[3], eight); in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - in[2] = _mm_srai_epi16(in[2], 4); - in[3] = _mm_srai_epi16(in[3], 4); - RECON_AND_STORE4X4(dest, in[0]); - RECON_AND_STORE4X4(dest, in[1]); - RECON_AND_STORE4X4(dest, in[2]); - RECON_AND_STORE4X4(dest, in[3]); + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( + *(const int *) (dest + stride * 3))); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } } #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ @@ -352,37 +356,31 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, 
out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ } -#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - \ +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \ + out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ + \ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = out5 = out6 = out7 = zero; \ } -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ - in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ } // Define Macro for multiplying elements by constants and adding them together. 
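// Editorial sketch, not part of the patch: per 32-bit lane, the
// MULTIPLICATION_AND_ADD macros that follow compute a rounded,
// right-shifted two-term dot product. The scalar equivalent (hypothetical
// helper name; DCT_CONST_BITS is 14 and DCT_CONST_ROUNDING is
// 1 << (DCT_CONST_BITS - 1) in vp9_idct.h) is:
static INLINE int example_butterfly(int16_t a, int16_t b,
                                    int16_t c0, int16_t c1) {
  return (a * c0 + b * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
}
// _mm_unpacklo/hi_epi16 interleaves the two input vectors, pair_set_epi16
// interleaves the two constants, _mm_madd_epi16 forms a * c0 + b * c1 in
// each 32-bit lane, and the add/srai/packs sequence applies this rounding
// shift followed by a saturating pack back to 16 bits (the scalar model
// omits the saturation).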
@@ -422,7 +420,30 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, res3 = _mm_packs_epi32(tmp6, tmp7); \ } -#define IDCT8_1D \ +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ /* Stage1 */ \ { \ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ @@ -482,23 +503,14 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, } \ \ /* Stage4 */ \ - in0 = _mm_adds_epi16(stp1_0, stp2_7); \ - in1 = _mm_adds_epi16(stp1_1, stp1_6); \ - in2 = _mm_adds_epi16(stp1_2, stp1_5); \ - in3 = _mm_adds_epi16(stp1_3, stp2_4); \ - in4 = _mm_subs_epi16(stp1_3, stp2_4); \ - in5 = _mm_subs_epi16(stp1_2, stp1_5); \ - in6 = _mm_subs_epi16(stp1_1, stp1_6); \ - in7 = _mm_subs_epi16(stp1_0, stp2_7); - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - dest += stride; \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ } void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { @@ -533,11 +545,12 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) { // 2-D for (i = 0; i < 2; i++) { // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8_1D + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in0, in1, in2, in3, in4, in5, in6, in7); } // Final rounding and shift @@ -590,37 +603,7 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, dc_value); } -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const 
__m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -static void idct8_1d_sse2(__m128i *in) { +static void idct8_sse2(__m128i *in) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); @@ -636,32 +619,16 @@ static void idct8_1d_sse2(__m128i *in) { __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - in0 = in[0]; - in1 = in[1]; - in2 = in[2]; - in3 = in[3]; - in4 = in[4]; - in5 = in[5]; - in6 = in[6]; - in7 = in[7]; - // 8x8 Transpose is copied from vp9_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + in0, in1, in2, in3, in4, in5, in6, in7); // 4-stage 1D idct8x8 - IDCT8_1D - in[0] = in0; - in[1] = in1; - in[2] = in2; - in[3] = in3; - in[4] = in4; - in[5] = in5; - in[6] = in6; - in[7] = in7; + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]); } -static void iadst8_1d_sse2(__m128i *in) { +static void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -908,20 +875,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct8_1d_sse2(in); - idct8_1d_sse2(in); + idct8_sse2(in); + idct8_sse2(in); break; case 1: // ADST_DCT - idct8_1d_sse2(in); - iadst8_1d_sse2(in); + idct8_sse2(in); + iadst8_sse2(in); break; case 2: // DCT_ADST - iadst8_1d_sse2(in); - idct8_1d_sse2(in); + iadst8_sse2(in); + idct8_sse2(in); break; case 3: // ADST_ADST - iadst8_1d_sse2(in); - iadst8_1d_sse2(in); + iadst8_sse2(in); + iadst8_sse2(in); break; default: assert(0); @@ -957,7 +924,7 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride, RECON_AND_STORE(dest, in[7]); } -void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<4); @@ -983,12 +950,11 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); // 8x4 Transpose - TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) - + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); // Stage1 { //NOLINT - const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); + 
const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); tmp0 = _mm_madd_epi16(lo_17, stg1_0); tmp2 = _mm_madd_epi16(lo_17, stg1_1); @@ -1004,16 +970,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_4 = _mm_packs_epi32(tmp0, zero); - stp1_7 = _mm_packs_epi32(tmp2, zero); - stp1_5 = _mm_packs_epi32(tmp4, zero); - stp1_6 = _mm_packs_epi32(tmp6, zero); + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); } // Stage2 { //NOLINT - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); tmp0 = _mm_madd_epi16(lo_04, stg2_0); tmp2 = _mm_madd_epi16(lo_04, stg2_1); @@ -1029,24 +993,26 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp2_0 = _mm_packs_epi32(tmp0, zero); - stp2_1 = _mm_packs_epi32(tmp2, zero); - stp2_2 = _mm_packs_epi32(tmp4, zero); - stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); } // Stage3 { //NOLINT const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); tmp0 = _mm_madd_epi16(lo_56, stg3_0); tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 @@ -1056,27 +1022,19 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp0, zero); - stp1_6 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp0, tmp2); } // Stage4 - in0 = _mm_adds_epi16(stp1_0, stp2_7); - in1 = _mm_adds_epi16(stp1_1, stp1_6); - in2 = _mm_adds_epi16(stp1_2, stp1_5); - in3 = _mm_adds_epi16(stp1_3, stp2_4); - in4 = _mm_subs_epi16(stp1_3, stp2_4); - in5 = _mm_subs_epi16(stp1_2, stp1_5); - in6 = _mm_subs_epi16(stp1_1, stp1_6); - in7 = _mm_subs_epi16(stp1_0, stp2_7); + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - // Columns. 
4x8 Transpose - TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7) - - // 1D idct8x8 - IDCT8_1D + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, + in0, in1, in2, in3, in4, in5, in6, in7); // Final rounding and shift in0 = _mm_adds_epi16(in0, final_rounding); in1 = _mm_adds_epi16(in1, final_rounding); @@ -1106,17 +1064,17 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } -#define IDCT16_1D \ +#define IDCT16 \ /* Stage2 */ \ { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ \ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ stg2_0, stg2_1, stg2_2, stg2_3, \ @@ -1129,10 +1087,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { \ /* Stage3 */ \ { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ \ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ stg3_0, stg3_1, stg3_2, stg3_3, \ @@ -1151,10 +1109,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { \ /* Stage4 */ \ { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ @@ -1235,6 +1193,114 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) { stp2_10, stp2_13, stp2_11, stp2_12) \ } +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + 
MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \ + stg2_0, stg2_1, stg2_6, stg2_7, \ + stp1_8_0, stp1_15, stp1_11, stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \ + stg3_0, stg3_1, \ + stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \ + stg4_0, stg4_1, \ + stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ + stg4_4, stg4_5, stg4_6, stg4_7, \ + stp2_9, stp2_14, stp2_10, stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, \ + stp2_10, stp2_13, stp2_11, stp2_12) \ + } + void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1266,16 +1332,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0 = zero, in1 = 
zero, in2 = zero, in3 = zero, in4 = zero, - in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, - in10 = zero, in11 = zero, in12 = zero, in13 = zero, - in14 = zero, in15 = zero; - __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, - l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, - l12 = zero, l13 = zero, l14 = zero, l15 = zero; - __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, - r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, - r12 = zero, r13 = zero, r14 = zero, r15 = zero; + __m128i in[16], l[16], r[16], *curr1; __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, stp1_12_0; @@ -1284,162 +1341,132 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. - for (i = 0; i < 4; i++) { - // 1-D idct - if (i < 2) { - if (i == 1) input += 128; + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct // Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); - in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); - in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); - in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); - in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); - in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); - in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); - in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); - in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); + in[0] = _mm_load_si128((const __m128i *)input); + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1)); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7)); + in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8)); + in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9)); + in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10)); + in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11)); + in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12)); + in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13)); + in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14)); + in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15)); - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - } + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); - if (i == 2) { - TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, - in13, in14, in15); - 
} + IDCT16 - if (i == 3) { - TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, - in12, in13, in14, in15); - } + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - IDCT16_1D + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + // 1-D idct + array_transpose_8x8(l+i*8, in); + array_transpose_8x8(r+i*8, in+8); + + IDCT16 - // Stage7 - if (i == 0) { - // Left 8x16 - l0 = _mm_add_epi16(stp2_0, stp1_15); - l1 = _mm_add_epi16(stp2_1, stp1_14); - l2 = _mm_add_epi16(stp2_2, stp2_13); - l3 = _mm_add_epi16(stp2_3, stp2_12); - l4 = _mm_add_epi16(stp2_4, stp2_11); - l5 = _mm_add_epi16(stp2_5, stp2_10); - l6 = _mm_add_epi16(stp2_6, stp1_9); - l7 = _mm_add_epi16(stp2_7, stp1_8); - l8 = _mm_sub_epi16(stp2_7, stp1_8); - l9 = _mm_sub_epi16(stp2_6, stp1_9); - l10 = _mm_sub_epi16(stp2_5, stp2_10); - l11 = _mm_sub_epi16(stp2_4, stp2_11); - l12 = _mm_sub_epi16(stp2_3, stp2_12); - l13 = _mm_sub_epi16(stp2_2, stp2_13); - l14 = _mm_sub_epi16(stp2_1, stp1_14); - l15 = _mm_sub_epi16(stp2_0, stp1_15); - } else if (i == 1) { - // Right 8x16 - r0 = _mm_add_epi16(stp2_0, stp1_15); - r1 = _mm_add_epi16(stp2_1, stp1_14); - r2 = _mm_add_epi16(stp2_2, stp2_13); - r3 = _mm_add_epi16(stp2_3, stp2_12); - r4 = _mm_add_epi16(stp2_4, stp2_11); - r5 = _mm_add_epi16(stp2_5, stp2_10); - r6 = _mm_add_epi16(stp2_6, stp1_9); - r7 = _mm_add_epi16(stp2_7, stp1_8); - r8 = _mm_sub_epi16(stp2_7, stp1_8); - r9 = _mm_sub_epi16(stp2_6, stp1_9); - r10 = _mm_sub_epi16(stp2_5, stp2_10); - r11 = _mm_sub_epi16(stp2_4, stp2_11); - r12 = _mm_sub_epi16(stp2_3, stp2_12); - r13 = _mm_sub_epi16(stp2_2, stp2_13); - r14 = _mm_sub_epi16(stp2_1, stp1_14); - r15 = _mm_sub_epi16(stp2_0, stp1_15); - } else { // 2-D - in0 = _mm_add_epi16(stp2_0, stp1_15); - in1 = _mm_add_epi16(stp2_1, stp1_14); - in2 = _mm_add_epi16(stp2_2, stp2_13); - in3 = _mm_add_epi16(stp2_3, stp2_12); - in4 = _mm_add_epi16(stp2_4, stp2_11); - in5 = _mm_add_epi16(stp2_5, stp2_10); - in6 = _mm_add_epi16(stp2_6, stp1_9); - in7 = _mm_add_epi16(stp2_7, stp1_8); - in8 = _mm_sub_epi16(stp2_7, stp1_8); - in9 = _mm_sub_epi16(stp2_6, stp1_9); - in10 = _mm_sub_epi16(stp2_5, stp2_10); - in11 = _mm_sub_epi16(stp2_4, stp2_11); - in12 = _mm_sub_epi16(stp2_3, stp2_12); - in13 = _mm_sub_epi16(stp2_2, stp2_13); - in14 = _mm_sub_epi16(stp2_1, stp1_14); - in15 = _mm_sub_epi16(stp2_0, stp1_15); + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = 
_mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, 
in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); dest += 8 - (stride * 16); - } } } @@ -1475,24 +1502,7 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { } } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static void iadst16_1d_8col(__m128i *in) { +static void iadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1962,7 +1972,7 @@ static void iadst16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_1d_8col(__m128i *in) { +static void idct16_8col(__m128i *in) { const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); @@ -2306,92 +2316,16 @@ static void idct16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(s[0], s[15]); } -static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { +static void idct16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); - idct16_1d_8col(in0); - idct16_1d_8col(in1); + idct16_8col(in0); + idct16_8col(in1); } -static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { +static void iadst16_sse2(__m128i *in0, __m128i *in1) { array_transpose_16x16(in0, in1); - iadst16_1d_8col(in0); - iadst16_1d_8col(in1); -} - -static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); - - in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); - in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); - in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); - in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); - in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); - in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); - in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); - in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); -} - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1<<5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] 
= _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest, in[0]); - RECON_AND_STORE(dest, in[1]); - RECON_AND_STORE(dest, in[2]); - RECON_AND_STORE(dest, in[3]); - RECON_AND_STORE(dest, in[4]); - RECON_AND_STORE(dest, in[5]); - RECON_AND_STORE(dest, in[6]); - RECON_AND_STORE(dest, in[7]); - RECON_AND_STORE(dest, in[8]); - RECON_AND_STORE(dest, in[9]); - RECON_AND_STORE(dest, in[10]); - RECON_AND_STORE(dest, in[11]); - RECON_AND_STORE(dest, in[12]); - RECON_AND_STORE(dest, in[13]); - RECON_AND_STORE(dest, in[14]); - RECON_AND_STORE(dest, in[15]); + iadst16_8col(in0); + iadst16_8col(in1); } void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, @@ -2404,20 +2338,20 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, switch (tx_type) { case 0: // DCT_DCT - idct16_1d_sse2(in0, in1); - idct16_1d_sse2(in0, in1); + idct16_sse2(in0, in1); + idct16_sse2(in0, in1); break; case 1: // ADST_DCT - idct16_1d_sse2(in0, in1); - iadst16_1d_sse2(in0, in1); + idct16_sse2(in0, in1); + iadst16_sse2(in0, in1); break; case 2: // DCT_ADST - iadst16_1d_sse2(in0, in1); - idct16_1d_sse2(in0, in1); + iadst16_sse2(in0, in1); + idct16_sse2(in0, in1); break; case 3: // ADST_ADST - iadst16_1d_sse2(in0, in1); - iadst16_1d_sse2(in0, in1); + iadst16_sse2(in0, in1); + iadst16_sse2(in0, in1); break; default: assert(0); @@ -2437,149 +2371,87 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i stg4_1 = 
pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, - in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, - in10 = zero, in11 = zero, in12 = zero, in13 = zero, - in14 = zero, in15 = zero; - __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, - l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, - l12 = zero, l13 = zero, l14 = zero, l15 = zero; - - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, stp1_12_0; __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i; - // 1-D idct. Load input data. - in0 = _mm_load_si128((const __m128i *)input); - in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); - in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); - in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); - in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); - in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); - in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); - in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); + // First 1-D inverse DCT + // Load input data. 
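+ // (Editorial note: the _10 shortcut assumes at most 10 nonzero
+ //  coefficients, which the scan order confines to the top-left 4x4 of
+ //  the 16x16 block; only rows 0-3, at a stride of 16 int16_t, are
+ //  loaded below, and every other input vector is treated as zero.)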
+ in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); - TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); // Stage2 { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); - const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); - const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); - const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); - tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); - tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); - tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); tmp5 = _mm_add_epi32(tmp5, rounding); tmp7 = _mm_add_epi32(tmp7, rounding); tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_8 = _mm_packs_epi32(tmp0, zero); - stp2_15 = _mm_packs_epi32(tmp2, zero); - stp2_9 = _mm_packs_epi32(tmp4, zero); - stp2_14 = _mm_packs_epi32(tmp6, zero); - - stp2_10 = _mm_packs_epi32(tmp1, zero); - stp2_13 = _mm_packs_epi32(tmp3, zero); - stp2_11 = _mm_packs_epi32(tmp5, zero); - stp2_12 = _mm_packs_epi32(tmp7, zero); + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); } // Stage3 { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); - const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); - tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_4 = _mm_packs_epi32(tmp0, zero); - stp1_7 = _mm_packs_epi32(tmp2, zero); - stp1_5 = _mm_packs_epi32(tmp4, zero); - stp1_6 = _mm_packs_epi32(tmp6, zero); + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); - - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + stp1_4 = _mm_packs_epi32(tmp0, tmp2); } // Stage4 { - const __m128i 
lo_0_8 = _mm_unpacklo_epi16(in0, in8); - const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); - tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); @@ -2587,8 +2459,6 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp0 = _mm_add_epi32(tmp0, rounding); tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); tmp1 = _mm_add_epi32(tmp1, rounding); tmp3 = _mm_add_epi32(tmp3, rounding); tmp5 = _mm_add_epi32(tmp5, rounding); @@ -2596,49 +2466,40 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - stp2_0 = _mm_packs_epi32(tmp0, zero); - stp2_1 = _mm_packs_epi32(tmp2, zero); - stp2_2 = _mm_packs_epi32(tmp4, zero); - stp2_3 = _mm_packs_epi32(tmp6, zero); - stp2_9 = _mm_packs_epi32(tmp1, zero); - stp2_14 = _mm_packs_epi32(tmp3, zero); - stp2_10 = _mm_packs_epi32(tmp5, zero); - stp2_13 = _mm_packs_epi32(tmp7, zero); + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); } // Stage5 and Stage6 { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); } // Stage6 { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); @@ -2663,124 
+2524,121 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp1, zero); - stp1_6 = _mm_packs_epi32(tmp3, zero); + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + stp2_10 = _mm_packs_epi32(tmp0, zero); stp2_13 = _mm_packs_epi32(tmp2, zero); stp2_11 = _mm_packs_epi32(tmp4, zero); stp2_12 = _mm_packs_epi32(tmp6, zero); - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); } // Stage7. Left 8x16 only. - l0 = _mm_add_epi16(stp2_0, stp1_15); - l1 = _mm_add_epi16(stp2_1, stp1_14); - l2 = _mm_add_epi16(stp2_2, stp2_13); - l3 = _mm_add_epi16(stp2_3, stp2_12); - l4 = _mm_add_epi16(stp2_4, stp2_11); - l5 = _mm_add_epi16(stp2_5, stp2_10); - l6 = _mm_add_epi16(stp2_6, stp1_9); - l7 = _mm_add_epi16(stp2_7, stp1_8); - l8 = _mm_sub_epi16(stp2_7, stp1_8); - l9 = _mm_sub_epi16(stp2_6, stp1_9); - l10 = _mm_sub_epi16(stp2_5, stp2_10); - l11 = _mm_sub_epi16(stp2_4, stp2_11); - l12 = _mm_sub_epi16(stp2_3, stp2_12); - l13 = _mm_sub_epi16(stp2_2, stp2_13); - l14 = _mm_sub_epi16(stp2_1, stp1_14); - l15 = _mm_sub_epi16(stp2_0, stp1_15); + l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); - // 2-D idct. We do 2 8x16 blocks. 
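+ // (Editorial note: in this reduced path only the left 8x16 half of the
+ //  intermediate result, saved in l[] above, can be nonzero, so the
+ //  second pass below runs twice over that half rather than over a full
+ //  16x16 block.)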
+ // Second 1-D inverse transform, performed per 8x16 block for (i = 0; i < 2; i++) { - if (i == 0) - TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, - in5, in6, in7); + array_transpose_4X8(l + 8*i, in); - if (i == 1) - TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, - in4, in5, in6, in7); - - in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; - - IDCT16_1D + IDCT16_10 // Stage7 - in0 = _mm_add_epi16(stp2_0, stp1_15); - in1 = _mm_add_epi16(stp2_1, stp1_14); - in2 = _mm_add_epi16(stp2_2, stp2_13); - in3 = _mm_add_epi16(stp2_3, stp2_12); - in4 = _mm_add_epi16(stp2_4, stp2_11); - in5 = _mm_add_epi16(stp2_5, stp2_10); - in6 = _mm_add_epi16(stp2_6, stp1_9); - in7 = _mm_add_epi16(stp2_7, stp1_8); - in8 = _mm_sub_epi16(stp2_7, stp1_8); - in9 = _mm_sub_epi16(stp2_6, stp1_9); - in10 = _mm_sub_epi16(stp2_5, stp2_10); - in11 = _mm_sub_epi16(stp2_4, stp2_11); - in12 = _mm_sub_epi16(stp2_3, stp2_12); - in13 = _mm_sub_epi16(stp2_2, stp2_13); - in14 = _mm_sub_epi16(stp2_1, stp1_14); - in15 = _mm_sub_epi16(stp2_0, stp1_15); + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = 
_mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); dest += 8 - (stride * 16); } @@ -2792,28 +2650,329 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, input += 8; \ } \ -#define IDCT32_1D \ +#define IDCT32_34 \ /* Stage1 */ \ { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ - const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = 
_mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \ + stg1_1, stp1_16, stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \ + stg1_7, stp1_19, stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \ + stg1_9, stp1_20, stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \ + stg1_15, stp1_23, stp1_24); \ +} \ +\ +/* Stage2 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \ + stg2_1, stp2_8, stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \ + stg2_7, stp2_11, stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ +} \ +\ +/* Stage3 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \ + stg3_1, stp1_4, stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ + stp1_18, stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ + stp1_22, stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ +} \ +\ +/* Stage4 */ \ +{ \ + const __m128i zero = _mm_setzero_si128();\ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \ + stg4_1, stp2_0, stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ + stp2_10, stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + 
stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ +} \ +\ +/* Stage5 */ \ +{ \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ + stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} \ +\ +/* Stage6 */ \ +{ \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + 
stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ + stp2_13, stp2_11, stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ +} \ +\ +/* Stage7 */ \ +{ \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ +} + + +#define IDCT32 \ +/* Stage1 */ \ +{ \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ + const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ + const __m128i hi_21_11 = 
_mm_unpackhi_epi16(in[21], in[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ \ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ @@ -2831,15 +2990,15 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage2 */ \ { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ \ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ @@ -2871,10 +3030,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage3 */ \ { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ \ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ @@ -2918,10 +3077,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, \ /* Stage4 */ \ { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ \ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ @@ -3178,10 +3337,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, - in24, in25, in26, in27, in28, in29, in30, in31; - __m128i col[128]; + __m128i in[32], col[32]; __m128i stp1_0, stp1_1, stp1_2, 
stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, @@ -3193,296 +3349,225 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; + int i; + // Load input data. + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); - // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. - for (i = 0; i < 8; i++) { - i32 = (i << 5); - if (i == 0) { - // First 1-D idct: first 8 rows - // Load input data. - LOAD_DQCOEFF(in0, input); - LOAD_DQCOEFF(in8, input); - LOAD_DQCOEFF(in16, input); - LOAD_DQCOEFF(in24, input); - LOAD_DQCOEFF(in1, input); - LOAD_DQCOEFF(in9, input); - LOAD_DQCOEFF(in17, input); - LOAD_DQCOEFF(in25, input); - LOAD_DQCOEFF(in2, input); - LOAD_DQCOEFF(in10, input); - LOAD_DQCOEFF(in18, input); - LOAD_DQCOEFF(in26, input); - LOAD_DQCOEFF(in3, input); - LOAD_DQCOEFF(in11, input); - LOAD_DQCOEFF(in19, input); - LOAD_DQCOEFF(in27, input); + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); - LOAD_DQCOEFF(in4, input); - LOAD_DQCOEFF(in12, input); - LOAD_DQCOEFF(in20, input); - LOAD_DQCOEFF(in28, input); - LOAD_DQCOEFF(in5, input); - LOAD_DQCOEFF(in13, input); - LOAD_DQCOEFF(in21, input); - LOAD_DQCOEFF(in29, input); - LOAD_DQCOEFF(in6, input); - LOAD_DQCOEFF(in14, input); - LOAD_DQCOEFF(in22, input); - LOAD_DQCOEFF(in30, input); - LOAD_DQCOEFF(in7, input); - LOAD_DQCOEFF(in15, input); - LOAD_DQCOEFF(in23, input); - LOAD_DQCOEFF(in31, input); + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + array_transpose_8x8(in+16, in+16); + array_transpose_8x8(in+24, in+24); - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, - in18, in19, in20, in21, in22, in23); - TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, - in26, in27, in28, in29, in30, in31); - } else if (i < 4) { - // First 1-D idct: next 24 zero-coeff rows - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = 
_mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } else { - // Second 1-D idct - j = i - 4; + IDCT32 - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, - in5, in6, in7); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, - in11, in12, in13, in14, in15); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, - in19, in20, in21, in22, in23); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, - in28, in29, in30, in31); - } - - IDCT32_1D - - // final stage - if (i < 4) { - // 1_D: Store 32 intermediate results for each 8x32 block. 
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } else { + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = _mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { const __m128i zero = _mm_setzero_si128(); + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col+i*8, in); + IDCT32_34 // 2_D: Calculate the results and store them to destination. 
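+ // (Editorial note: the adds_epi16/srai_epi16 pairs below round away
+ //  the six fractional bits left by the two transform passes -- per
+ //  lane, res = (res + 32) >> 6 -- and each RECON_AND_STORE then adds
+ //  the rounded residual to the predictor row at dest with unsigned
+ //  saturation; a sketch of the macros' effect, not their exact bodies.)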
- in0 = _mm_add_epi16(stp1_0, stp1_31); - in1 = _mm_add_epi16(stp1_1, stp1_30); - in2 = _mm_add_epi16(stp1_2, stp1_29); - in3 = _mm_add_epi16(stp1_3, stp1_28); - in4 = _mm_add_epi16(stp1_4, stp1_27); - in5 = _mm_add_epi16(stp1_5, stp1_26); - in6 = _mm_add_epi16(stp1_6, stp1_25); - in7 = _mm_add_epi16(stp1_7, stp1_24); - in8 = _mm_add_epi16(stp1_8, stp1_23); - in9 = _mm_add_epi16(stp1_9, stp1_22); - in10 = _mm_add_epi16(stp1_10, stp1_21); - in11 = _mm_add_epi16(stp1_11, stp1_20); - in12 = _mm_add_epi16(stp1_12, stp1_19); - in13 = _mm_add_epi16(stp1_13, stp1_18); - in14 = _mm_add_epi16(stp1_14, stp1_17); - in15 = _mm_add_epi16(stp1_15, stp1_16); - in16 = _mm_sub_epi16(stp1_15, stp1_16); - in17 = _mm_sub_epi16(stp1_14, stp1_17); - in18 = _mm_sub_epi16(stp1_13, stp1_18); - in19 = _mm_sub_epi16(stp1_12, stp1_19); - in20 = _mm_sub_epi16(stp1_11, stp1_20); - in21 = _mm_sub_epi16(stp1_10, stp1_21); - in22 = _mm_sub_epi16(stp1_9, stp1_22); - in23 = _mm_sub_epi16(stp1_8, stp1_23); - in24 = _mm_sub_epi16(stp1_7, stp1_24); - in25 = _mm_sub_epi16(stp1_6, stp1_25); - in26 = _mm_sub_epi16(stp1_5, stp1_26); - in27 = _mm_sub_epi16(stp1_4, stp1_27); - in28 = _mm_sub_epi16(stp1_3, stp1_28); - in29 = _mm_sub_epi16(stp1_2, stp1_29); - in30 = _mm_sub_epi16(stp1_1, stp1_30); - in31 = _mm_sub_epi16(stp1_0, stp1_31); + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - in16 = _mm_adds_epi16(in16, final_rounding); - in17 = _mm_adds_epi16(in17, final_rounding); - in18 = 
_mm_adds_epi16(in18, final_rounding); - in19 = _mm_adds_epi16(in19, final_rounding); - in20 = _mm_adds_epi16(in20, final_rounding); - in21 = _mm_adds_epi16(in21, final_rounding); - in22 = _mm_adds_epi16(in22, final_rounding); - in23 = _mm_adds_epi16(in23, final_rounding); - in24 = _mm_adds_epi16(in24, final_rounding); - in25 = _mm_adds_epi16(in25, final_rounding); - in26 = _mm_adds_epi16(in26, final_rounding); - in27 = _mm_adds_epi16(in27, final_rounding); - in28 = _mm_adds_epi16(in28, final_rounding); - in29 = _mm_adds_epi16(in29, final_rounding); - in30 = _mm_adds_epi16(in30, final_rounding); - in31 = _mm_adds_epi16(in31, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + in[16] = _mm_adds_epi16(in[16], final_rounding); + in[17] = _mm_adds_epi16(in[17], final_rounding); + in[18] = _mm_adds_epi16(in[18], final_rounding); + in[19] = _mm_adds_epi16(in[19], final_rounding); + in[20] = _mm_adds_epi16(in[20], final_rounding); + in[21] = _mm_adds_epi16(in[21], final_rounding); + in[22] = _mm_adds_epi16(in[22], final_rounding); + in[23] = _mm_adds_epi16(in[23], final_rounding); + in[24] = _mm_adds_epi16(in[24], final_rounding); + in[25] = _mm_adds_epi16(in[25], final_rounding); + in[26] = _mm_adds_epi16(in[26], final_rounding); + in[27] = _mm_adds_epi16(in[27], final_rounding); + in[28] = _mm_adds_epi16(in[28], final_rounding); + in[29] = _mm_adds_epi16(in[29], final_rounding); + in[30] = _mm_adds_epi16(in[30], final_rounding); + in[31] = _mm_adds_epi16(in[31], final_rounding); - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - in16 = _mm_srai_epi16(in16, 6); - in17 = _mm_srai_epi16(in17, 6); - in18 = _mm_srai_epi16(in18, 6); - in19 = _mm_srai_epi16(in19, 6); - in20 = _mm_srai_epi16(in20, 6); - in21 = _mm_srai_epi16(in21, 6); - in22 = _mm_srai_epi16(in22, 6); - in23 = _mm_srai_epi16(in23, 6); - in24 = _mm_srai_epi16(in24, 6); - in25 = _mm_srai_epi16(in25, 6); - in26 = _mm_srai_epi16(in26, 6); - in27 = _mm_srai_epi16(in27, 6); - in28 = _mm_srai_epi16(in28, 6); - in29 = _mm_srai_epi16(in29, 6); - in30 = _mm_srai_epi16(in30, 6); - in31 = _mm_srai_epi16(in31, 6); + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = 
_mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + in[16] = _mm_srai_epi16(in[16], 6); + in[17] = _mm_srai_epi16(in[17], 6); + in[18] = _mm_srai_epi16(in[18], 6); + in[19] = _mm_srai_epi16(in[19], 6); + in[20] = _mm_srai_epi16(in[20], 6); + in[21] = _mm_srai_epi16(in[21], 6); + in[22] = _mm_srai_epi16(in[22], 6); + in[23] = _mm_srai_epi16(in[23], 6); + in[24] = _mm_srai_epi16(in[24], 6); + in[25] = _mm_srai_epi16(in[25], 6); + in[26] = _mm_srai_epi16(in[26], 6); + in[27] = _mm_srai_epi16(in[27], 6); + in[28] = _mm_srai_epi16(in[28], 6); + in[29] = _mm_srai_epi16(in[29], 6); + in[30] = _mm_srai_epi16(in[30], 6); + in[31] = _mm_srai_epi16(in[31], 6); - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); - RECON_AND_STORE(dest, in16); - RECON_AND_STORE(dest, in17); - RECON_AND_STORE(dest, in18); - RECON_AND_STORE(dest, in19); - RECON_AND_STORE(dest, in20); - RECON_AND_STORE(dest, in21); - RECON_AND_STORE(dest, in22); - RECON_AND_STORE(dest, in23); - RECON_AND_STORE(dest, in24); - RECON_AND_STORE(dest, in25); - RECON_AND_STORE(dest, in26); - RECON_AND_STORE(dest, in27); - RECON_AND_STORE(dest, in28); - RECON_AND_STORE(dest, in29); - RECON_AND_STORE(dest, in30); - RECON_AND_STORE(dest, in31); + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + RECON_AND_STORE(dest, in[16]); + RECON_AND_STORE(dest, in[17]); + RECON_AND_STORE(dest, in[18]); + RECON_AND_STORE(dest, in[19]); + RECON_AND_STORE(dest, in[20]); + RECON_AND_STORE(dest, in[21]); + RECON_AND_STORE(dest, in[22]); + RECON_AND_STORE(dest, in[23]); + RECON_AND_STORE(dest, in[24]); + RECON_AND_STORE(dest, in[25]); + RECON_AND_STORE(dest, in[26]); + RECON_AND_STORE(dest, in[27]); + RECON_AND_STORE(dest, in[28]); + RECON_AND_STORE(dest, in[29]); + RECON_AND_STORE(dest, in[30]); + RECON_AND_STORE(dest, in[31]); dest += 8 - (stride * 32); } } -} void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int stride) { @@ -3537,10 +3622,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, - in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, - in24, in25, in26, in27, in28, in29, in30, in31; - __m128i col[128]; + __m128i in[32], col[128], zero_idx[16]; __m128i stp1_0, 
stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, @@ -3553,66 +3635,63 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int i, j, i32; - __m128i zero_idx[16]; int zero_flag[2]; - // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. - for (i = 0; i < 8; i++) { + for (i = 0; i < 4; i++) { i32 = (i << 5); - if (i < 4) { // First 1-D idct // Load input data. - LOAD_DQCOEFF(in0, input); - LOAD_DQCOEFF(in8, input); - LOAD_DQCOEFF(in16, input); - LOAD_DQCOEFF(in24, input); - LOAD_DQCOEFF(in1, input); - LOAD_DQCOEFF(in9, input); - LOAD_DQCOEFF(in17, input); - LOAD_DQCOEFF(in25, input); - LOAD_DQCOEFF(in2, input); - LOAD_DQCOEFF(in10, input); - LOAD_DQCOEFF(in18, input); - LOAD_DQCOEFF(in26, input); - LOAD_DQCOEFF(in3, input); - LOAD_DQCOEFF(in11, input); - LOAD_DQCOEFF(in19, input); - LOAD_DQCOEFF(in27, input); + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); - LOAD_DQCOEFF(in4, input); - LOAD_DQCOEFF(in12, input); - LOAD_DQCOEFF(in20, input); - LOAD_DQCOEFF(in28, input); - LOAD_DQCOEFF(in5, input); - LOAD_DQCOEFF(in13, input); - LOAD_DQCOEFF(in21, input); - LOAD_DQCOEFF(in29, input); - LOAD_DQCOEFF(in6, input); - LOAD_DQCOEFF(in14, input); - LOAD_DQCOEFF(in22, input); - LOAD_DQCOEFF(in30, input); - LOAD_DQCOEFF(in7, input); - LOAD_DQCOEFF(in15, input); - LOAD_DQCOEFF(in23, input); - LOAD_DQCOEFF(in31, input); + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in0, in1); - zero_idx[1] = _mm_or_si128(in2, in3); - zero_idx[2] = _mm_or_si128(in4, in5); - zero_idx[3] = _mm_or_si128(in6, in7); - zero_idx[4] = _mm_or_si128(in8, in9); - zero_idx[5] = _mm_or_si128(in10, in11); - zero_idx[6] = _mm_or_si128(in12, in13); - zero_idx[7] = _mm_or_si128(in14, in15); - zero_idx[8] = _mm_or_si128(in16, in17); - zero_idx[9] = _mm_or_si128(in18, in19); - zero_idx[10] = _mm_or_si128(in20, in21); - zero_idx[11] = _mm_or_si128(in22, in23); - zero_idx[12] = _mm_or_si128(in24, in25); - zero_idx[13] = _mm_or_si128(in26, in27); - zero_idx[14] = _mm_or_si128(in28, in29); - zero_idx[15] = _mm_or_si128(in30, in31); + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = 
_mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); @@ -3674,44 +3753,13 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, } // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, - in18, in19, in20, in21, in22, in23); - TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, - in26, in27, in28, in29, in30, in31); - } else { - // Second 1-D idct - j = i - 4; + array_transpose_8x8(in, in); + array_transpose_8x8(in+8, in+8); + array_transpose_8x8(in+16, in+16); + array_transpose_8x8(in+24, in+24); - // Transpose 32x8 block to 8x32 block - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, - in5, in6, in7); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, - in11, in12, in13, in14, in15); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, - in19, in20, in21, in22, in23); - j += 4; - TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], - col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], - col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, - in28, in29, in30, in31); - } + IDCT32 - IDCT32_1D - - // final stage - if (i < 4) { // 1_D: Store 32 intermediate results for each 8x32 block. col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); @@ -3745,146 +3793,156 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } else { + } + for (i = 0; i < 4; i++) { const __m128i zero = _mm_setzero_si128(); + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col+j, in); + array_transpose_8x8(col+j+32, in+8); + array_transpose_8x8(col+j+64, in+16); + array_transpose_8x8(col+j+96, in+24); + + IDCT32 // 2_D: Calculate the results and store them to destination. 
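The restructuring in this hunk makes the separable row/column decomposition explicit: the first loop runs a 1-D IDCT32 over the coefficient rows into col[], and this second loop transposes col[] and runs IDCT32 again before reconstruction into dest. A scalar sketch of that flow, not part of the patch itself — idct32_1d() is a hypothetical 1-D helper, and (x + 32) >> 6 stands in for the final_rounding/_mm_srai_epi16(..., 6) step:

```c
#include <stdint.h>

/* Hypothetical 1-D 32-point inverse DCT; stands in for the IDCT32 macro. */
void idct32_1d(const int32_t in[32], int32_t out[32]);

static void idct32x32_ref(const int32_t in[32 * 32], int32_t out[32 * 32]) {
  int32_t tmp[32 * 32];
  int r, c;
  /* Pass 1: 1-D IDCT down each column; the SIMD code reaches the same
   * layout by transposing 8x32 blocks before and after each pass. */
  for (c = 0; c < 32; ++c) {
    int32_t v[32], w[32];
    for (r = 0; r < 32; ++r) v[r] = in[r * 32 + c];
    idct32_1d(v, w);
    for (r = 0; r < 32; ++r) tmp[r * 32 + c] = w[r];
  }
  /* Pass 2: 1-D IDCT along each row, then round and shift by 6 -- the
   * scalar equivalent of adds(final_rounding) followed by srai(..., 6). */
  for (r = 0; r < 32; ++r) {
    int32_t w[32];
    idct32_1d(&tmp[r * 32], w);
    for (c = 0; c < 32; ++c) out[r * 32 + c] = (w[c] + 32) >> 6;
  }
}
```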
- in0 = _mm_add_epi16(stp1_0, stp1_31); - in1 = _mm_add_epi16(stp1_1, stp1_30); - in2 = _mm_add_epi16(stp1_2, stp1_29); - in3 = _mm_add_epi16(stp1_3, stp1_28); - in4 = _mm_add_epi16(stp1_4, stp1_27); - in5 = _mm_add_epi16(stp1_5, stp1_26); - in6 = _mm_add_epi16(stp1_6, stp1_25); - in7 = _mm_add_epi16(stp1_7, stp1_24); - in8 = _mm_add_epi16(stp1_8, stp1_23); - in9 = _mm_add_epi16(stp1_9, stp1_22); - in10 = _mm_add_epi16(stp1_10, stp1_21); - in11 = _mm_add_epi16(stp1_11, stp1_20); - in12 = _mm_add_epi16(stp1_12, stp1_19); - in13 = _mm_add_epi16(stp1_13, stp1_18); - in14 = _mm_add_epi16(stp1_14, stp1_17); - in15 = _mm_add_epi16(stp1_15, stp1_16); - in16 = _mm_sub_epi16(stp1_15, stp1_16); - in17 = _mm_sub_epi16(stp1_14, stp1_17); - in18 = _mm_sub_epi16(stp1_13, stp1_18); - in19 = _mm_sub_epi16(stp1_12, stp1_19); - in20 = _mm_sub_epi16(stp1_11, stp1_20); - in21 = _mm_sub_epi16(stp1_10, stp1_21); - in22 = _mm_sub_epi16(stp1_9, stp1_22); - in23 = _mm_sub_epi16(stp1_8, stp1_23); - in24 = _mm_sub_epi16(stp1_7, stp1_24); - in25 = _mm_sub_epi16(stp1_6, stp1_25); - in26 = _mm_sub_epi16(stp1_5, stp1_26); - in27 = _mm_sub_epi16(stp1_4, stp1_27); - in28 = _mm_sub_epi16(stp1_3, stp1_28); - in29 = _mm_sub_epi16(stp1_2, stp1_29); - in30 = _mm_sub_epi16(stp1_1, stp1_30); - in31 = _mm_sub_epi16(stp1_0, stp1_31); + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - in8 = _mm_adds_epi16(in8, final_rounding); - in9 = _mm_adds_epi16(in9, final_rounding); - in10 = _mm_adds_epi16(in10, final_rounding); - in11 = _mm_adds_epi16(in11, final_rounding); - in12 = _mm_adds_epi16(in12, final_rounding); - in13 = _mm_adds_epi16(in13, final_rounding); - in14 = _mm_adds_epi16(in14, final_rounding); - in15 = _mm_adds_epi16(in15, final_rounding); - in16 = _mm_adds_epi16(in16, final_rounding); - in17 = _mm_adds_epi16(in17, final_rounding); - in18 = 
_mm_adds_epi16(in18, final_rounding); - in19 = _mm_adds_epi16(in19, final_rounding); - in20 = _mm_adds_epi16(in20, final_rounding); - in21 = _mm_adds_epi16(in21, final_rounding); - in22 = _mm_adds_epi16(in22, final_rounding); - in23 = _mm_adds_epi16(in23, final_rounding); - in24 = _mm_adds_epi16(in24, final_rounding); - in25 = _mm_adds_epi16(in25, final_rounding); - in26 = _mm_adds_epi16(in26, final_rounding); - in27 = _mm_adds_epi16(in27, final_rounding); - in28 = _mm_adds_epi16(in28, final_rounding); - in29 = _mm_adds_epi16(in29, final_rounding); - in30 = _mm_adds_epi16(in30, final_rounding); - in31 = _mm_adds_epi16(in31, final_rounding); + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + in[16] = _mm_adds_epi16(in[16], final_rounding); + in[17] = _mm_adds_epi16(in[17], final_rounding); + in[18] = _mm_adds_epi16(in[18], final_rounding); + in[19] = _mm_adds_epi16(in[19], final_rounding); + in[20] = _mm_adds_epi16(in[20], final_rounding); + in[21] = _mm_adds_epi16(in[21], final_rounding); + in[22] = _mm_adds_epi16(in[22], final_rounding); + in[23] = _mm_adds_epi16(in[23], final_rounding); + in[24] = _mm_adds_epi16(in[24], final_rounding); + in[25] = _mm_adds_epi16(in[25], final_rounding); + in[26] = _mm_adds_epi16(in[26], final_rounding); + in[27] = _mm_adds_epi16(in[27], final_rounding); + in[28] = _mm_adds_epi16(in[28], final_rounding); + in[29] = _mm_adds_epi16(in[29], final_rounding); + in[30] = _mm_adds_epi16(in[30], final_rounding); + in[31] = _mm_adds_epi16(in[31], final_rounding); - in0 = _mm_srai_epi16(in0, 6); - in1 = _mm_srai_epi16(in1, 6); - in2 = _mm_srai_epi16(in2, 6); - in3 = _mm_srai_epi16(in3, 6); - in4 = _mm_srai_epi16(in4, 6); - in5 = _mm_srai_epi16(in5, 6); - in6 = _mm_srai_epi16(in6, 6); - in7 = _mm_srai_epi16(in7, 6); - in8 = _mm_srai_epi16(in8, 6); - in9 = _mm_srai_epi16(in9, 6); - in10 = _mm_srai_epi16(in10, 6); - in11 = _mm_srai_epi16(in11, 6); - in12 = _mm_srai_epi16(in12, 6); - in13 = _mm_srai_epi16(in13, 6); - in14 = _mm_srai_epi16(in14, 6); - in15 = _mm_srai_epi16(in15, 6); - in16 = _mm_srai_epi16(in16, 6); - in17 = _mm_srai_epi16(in17, 6); - in18 = _mm_srai_epi16(in18, 6); - in19 = _mm_srai_epi16(in19, 6); - in20 = _mm_srai_epi16(in20, 6); - in21 = _mm_srai_epi16(in21, 6); - in22 = _mm_srai_epi16(in22, 6); - in23 = _mm_srai_epi16(in23, 6); - in24 = _mm_srai_epi16(in24, 6); - in25 = _mm_srai_epi16(in25, 6); - in26 = _mm_srai_epi16(in26, 6); - in27 = _mm_srai_epi16(in27, 6); - in28 = _mm_srai_epi16(in28, 6); - in29 = _mm_srai_epi16(in29, 6); - in30 = _mm_srai_epi16(in30, 6); - in31 = _mm_srai_epi16(in31, 6); + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = 
_mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + in[16] = _mm_srai_epi16(in[16], 6); + in[17] = _mm_srai_epi16(in[17], 6); + in[18] = _mm_srai_epi16(in[18], 6); + in[19] = _mm_srai_epi16(in[19], 6); + in[20] = _mm_srai_epi16(in[20], 6); + in[21] = _mm_srai_epi16(in[21], 6); + in[22] = _mm_srai_epi16(in[22], 6); + in[23] = _mm_srai_epi16(in[23], 6); + in[24] = _mm_srai_epi16(in[24], 6); + in[25] = _mm_srai_epi16(in[25], 6); + in[26] = _mm_srai_epi16(in[26], 6); + in[27] = _mm_srai_epi16(in[27], 6); + in[28] = _mm_srai_epi16(in[28], 6); + in[29] = _mm_srai_epi16(in[29], 6); + in[30] = _mm_srai_epi16(in[30], 6); + in[31] = _mm_srai_epi16(in[31], 6); - RECON_AND_STORE(dest, in0); - RECON_AND_STORE(dest, in1); - RECON_AND_STORE(dest, in2); - RECON_AND_STORE(dest, in3); - RECON_AND_STORE(dest, in4); - RECON_AND_STORE(dest, in5); - RECON_AND_STORE(dest, in6); - RECON_AND_STORE(dest, in7); - RECON_AND_STORE(dest, in8); - RECON_AND_STORE(dest, in9); - RECON_AND_STORE(dest, in10); - RECON_AND_STORE(dest, in11); - RECON_AND_STORE(dest, in12); - RECON_AND_STORE(dest, in13); - RECON_AND_STORE(dest, in14); - RECON_AND_STORE(dest, in15); - RECON_AND_STORE(dest, in16); - RECON_AND_STORE(dest, in17); - RECON_AND_STORE(dest, in18); - RECON_AND_STORE(dest, in19); - RECON_AND_STORE(dest, in20); - RECON_AND_STORE(dest, in21); - RECON_AND_STORE(dest, in22); - RECON_AND_STORE(dest, in23); - RECON_AND_STORE(dest, in24); - RECON_AND_STORE(dest, in25); - RECON_AND_STORE(dest, in26); - RECON_AND_STORE(dest, in27); - RECON_AND_STORE(dest, in28); - RECON_AND_STORE(dest, in29); - RECON_AND_STORE(dest, in30); - RECON_AND_STORE(dest, in31); + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + RECON_AND_STORE(dest, in[16]); + RECON_AND_STORE(dest, in[17]); + RECON_AND_STORE(dest, in[18]); + RECON_AND_STORE(dest, in[19]); + RECON_AND_STORE(dest, in[20]); + RECON_AND_STORE(dest, in[21]); + RECON_AND_STORE(dest, in[22]); + RECON_AND_STORE(dest, in[23]); + RECON_AND_STORE(dest, in[24]); + RECON_AND_STORE(dest, in[25]); + RECON_AND_STORE(dest, in[26]); + RECON_AND_STORE(dest, in[27]); + RECON_AND_STORE(dest, in[28]); + RECON_AND_STORE(dest, in[29]); + RECON_AND_STORE(dest, in[30]); + RECON_AND_STORE(dest, in[31]); dest += 8 - (stride * 32); } - } } //NOLINT void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h new file mode 100644 index 000000000000..0f179b49a57d --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+// perform 8x8 transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
+  {                                                     \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+                                                        \
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
+  }
+
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+}
+
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+  __m128i tbuf[8];
+  array_transpose_8x8(res0, res0);
+  array_transpose_8x8(res1, tbuf);
+  array_transpose_8x8(res0 + 8, res1);
+  array_transpose_8x8(res1 + 8, res1 + 8);
+
+  res0[8] = tbuf[0];
+  res0[9] = tbuf[1];
+  res0[10] = tbuf[2];
+  res0[11] = tbuf[3];
+  res0[12] = tbuf[4];
+  res0[13] = tbuf[5];
+  res0[14] = tbuf[6];
+  res0[15] = tbuf[7];
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
+  in[2] =
_mm_load_si128((const __m128i *)(input + 2 * 16)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); + + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + dest += stride; \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); +} diff --git a/media/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c b/media/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c new file mode 100644 index 000000000000..73bf5d1d78ec --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c @@ -0,0 +1,762 @@ +/* + * Copyright (c) 2014 The 
WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h>  // SSSE3
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
+static void idct16_8col(__m128i *in, int round) {
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+  __m128i v[16], u[16], s[16], t[16];
+
+  // stage 1
+  s[0] = in[0];
+  s[1] = in[8];
+  s[2] = in[4];
+  s[3] = in[12];
+  s[4] = in[2];
+  s[5] = in[10];
+  s[6] = in[6];
+  s[7] = in[14];
+  s[8] = in[1];
+  s[9] = in[9];
+  s[10] = in[5];
+  s[11] = in[13];
+  s[12] = in[3];
+  s[13] = in[11];
+  s[14] = in[7];
+  s[15] = in[15];
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
+  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
+  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
+  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
+  v[12] =
_mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = 
_mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_add_epi16(t[0], t[1]); + u[1] = _mm_sub_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_sub_epi16(s[6], s[5]); + u[1] = _mm_add_epi16(s[6], s[5]); + t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + t[6] = 
_mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + if (round == 1) { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + } else { + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_sub_epi16(t[13], t[10]); + u[1] = _mm_add_epi16(t[13], t[10]); + u[2] = _mm_sub_epi16(t[12], t[11]); + u[3] = _mm_add_epi16(t[12], t[11]); + + s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2); + s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2); + s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2); + s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2); + s[14] = t[14]; + s[15] = t[15]; + } + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], 
s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +static void idct16_sse2(__m128i *in0, __m128i *in1, int round) { + array_transpose_16x16(in0, in1); + idct16_8col(in0, round); + idct16_8col(in1, round); +} + +void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, + int stride) { + __m128i in0[16], in1[16]; + + load_buffer_8x16(input, in0); + input += 8; + load_buffer_8x16(input, in1); + + idct16_sse2(in0, in1, 0); + idct16_sse2(in0, in1, 1); + + write_buffer_8x16(dest, in0, stride); + dest += 8; + write_buffer_8x16(dest, in1, stride); +} + +static void idct16_10_r1(__m128i *in, __m128i *l) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_01 = dual_set_epi16(3212, 32610); + const __m128i stg2_67 = dual_set_epi16(-9512, 31358); + const __m128i stg3_01 = dual_set_epi16(6392, 32138); + const __m128i stg4_01 = dual_set_epi16(23170, 23170); + + + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + __m128i stp1_0, stp1_1, stp1_4, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4; + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]); + const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]); + + stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01); + stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]); + stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp4 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + + stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0); + stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp2, tmp4); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + 
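The dual_set_epi16 constants above are doubled 14-bit cosine terms (23170 = 2 * cospi_16_64 = 2 * 11585), which lets a single _mm_mulhrs_epi16 replace the pmaddwd / add-rounding / shift sequence used on the SSE2 path. A scalar model of one lane, assuming only the architectural definition of pmulhrsw:

```c
#include <stdint.h>

/* One lane of _mm_mulhrs_epi16: round((a * b) / 2^15). */
static int16_t mulhrs_lane(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
}

/* With a doubled constant b = 2 * c this equals the 14-bit fixed-point
 * multiply done elsewhere with pmaddwd plus DCT_CONST_ROUNDING:
 *   mulhrs_lane(a, 2 * c) == (a * c + (1 << 13)) >> 14
 * e.g. c = cospi_16_64 = 11585 gives b = 23170 (stg4_01 above).  Both
 * the instruction and this model yield -32768 for the a = b = -32768
 * corner case on two's-complement targets. */
```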
+ // Stage6 + { + const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4); + const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10); + const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10); + const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11); + const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11); + + tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6); + tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14); + tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13); + + stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01); + tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01); + tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01); + + stp2_10 = _mm_unpacklo_epi64(tmp0, zero); + stp2_13 = _mm_unpackhi_epi64(tmp0, zero); + stp2_11 = _mm_unpacklo_epi64(tmp4, zero); + stp2_12 = _mm_unpackhi_epi64(tmp4, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. + l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); +} + +static void idct16_10_r2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + const __m128i stg2_0 = dual_set_epi16(3212, 3212); + const __m128i stg2_1 = dual_set_epi16(32610, 32610); + const __m128i stg2_6 = dual_set_epi16(-9512, -9512); + const __m128i stg2_7 = dual_set_epi16(31358, 31358); + const __m128i stg3_0 = dual_set_epi16(6392, 6392); + const __m128i stg3_1 = dual_set_epi16(32138, 32138); + const __m128i stg4_01 = dual_set_epi16(23170, 23170); + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Stage2 */ + { + stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0); + stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1); + stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6); + stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7); + } + + /* Stage3 */ + { + stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0); + stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1); + + stp1_9 = stp1_8_0; + stp1_10 = stp1_11; + + stp1_13 = stp1_12_0; + stp1_14 = stp1_15; + } + + /* Stage4 */ + { + 
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01); + + stp2_5 = stp2_4; + stp2_6 = stp2_7; + + + tmp0 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp1 = _mm_madd_epi16(hi_9_14, stg4_4); + tmp2 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp3 = _mm_madd_epi16(hi_9_14, stg4_5); + tmp4 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp5 = _mm_madd_epi16(hi_10_13, stg4_6); + tmp6 = _mm_madd_epi16(lo_10_13, stg4_7); + tmp7 = _mm_madd_epi16(hi_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + tmp4 = _mm_srai_epi32(tmp4, 14); + tmp5 = _mm_srai_epi32(tmp5, 14); + tmp6 = _mm_srai_epi32(tmp6, 14); + tmp7 = _mm_srai_epi32(tmp7, 14); + + stp2_9 = _mm_packs_epi32(tmp0, tmp1); + stp2_14 = _mm_packs_epi32(tmp2, tmp3); + stp2_10 = _mm_packs_epi32(tmp4, tmp5); + stp2_13 = _mm_packs_epi32(tmp6, tmp7); + } + + /* Stage5 */ + { + stp1_2 = stp1_0; + stp1_3 = stp1_0; + + tmp0 = _mm_sub_epi16(stp2_6, stp2_5); + tmp1 = _mm_add_epi16(stp2_6, stp2_5); + + stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01); + stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + /* Stage6 */ + { + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_0, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + + tmp0 = _mm_sub_epi16(stp1_13, stp1_10); + tmp1 = _mm_add_epi16(stp1_13, stp1_10); + tmp2 = _mm_sub_epi16(stp1_12, stp1_11); + tmp3 = _mm_add_epi16(stp1_12, stp1_11); + + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_0, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + + stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01); + stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01); + stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01); + stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01); + } + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); +} + +void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, + int stride) { + 
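As with the other small-eob variants, the _10 suffix encodes a caller guarantee: at most the first 10 coefficients in scan order are nonzero, which confines them to the top-left 4x4 corner of the 16x16 block. That is why the first pass below loads only the first eight entries of rows 0-3 and immediately narrows the working set with TRANSPOSE_8X4 and array_transpose_4X8. A hypothetical checker, written for illustration only, states the assumed precondition:

```c
#include <stdint.h>

/* Precondition assumed by the 10-coefficient fast path: every nonzero
 * dequantized coefficient lies in the top-left 4x4 of the 16x16 block. */
static int only_top_left_4x4(const int16_t coeffs[16 * 16]) {
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      if ((r >= 4 || c >= 4) && coeffs[r * 16 + c] != 0)
        return 0;
  return 1;
}
```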
const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + __m128i in[16], l[16]; + + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = _mm_load_si128((const __m128i *)input); + in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2)); + in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4)); + in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6)); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + idct16_10_r1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + array_transpose_4X8(l + 8*i, in); + + idct16_10_r2(in); + + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest, in[0]); + RECON_AND_STORE(dest, in[1]); + RECON_AND_STORE(dest, in[2]); + RECON_AND_STORE(dest, in[3]); + RECON_AND_STORE(dest, in[4]); + RECON_AND_STORE(dest, in[5]); + RECON_AND_STORE(dest, in[6]); + RECON_AND_STORE(dest, in[7]); + RECON_AND_STORE(dest, in[8]); + RECON_AND_STORE(dest, in[9]); + RECON_AND_STORE(dest, in[10]); + RECON_AND_STORE(dest, in[11]); + RECON_AND_STORE(dest, in[12]); + RECON_AND_STORE(dest, in[13]); + RECON_AND_STORE(dest, in[14]); + RECON_AND_STORE(dest, in[15]); + + dest += 8 - (stride * 16); + } +} diff --git a/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm b/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm new file mode 100644 index 000000000000..2c1060710cc0 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_idct_ssse3_x86_64.asm @@ -0,0 +1,300 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the inverse transformation. Part +; of the functions are originally derived from the ffmpeg project. +; Note that the current version applies to x86 64-bit only. 
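TRANSFORM_COEFFS just below emits interleaved cos/sin coefficient pairs, and BUTTERFLY_4X applies the standard fixed-point rotation to them with pmaddwd, pd_8192 rounding, and a 14-bit arithmetic shift. A scalar model of one such rotation, for reference (the pair 6270/15137, for instance, is round(2^14 * cos(3*pi/8)) and round(2^14 * sin(3*pi/8))):

```c
#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))  /* pd_8192 */

/* One butterfly of BUTTERFLY_4X: rotate the pair (x0, x1) by the angle
 * encoded in the 14-bit fixed-point constants (c0, c1) ~ (cos, sin). */
static void butterfly(int16_t x0, int16_t x1, int c0, int c1,
                      int16_t *y0, int16_t *y1) {
  /* pmaddwd forms x0*c0 - x1*c1 and x0*c1 + x1*c0 per 32-bit lane;
   * paddd pd_8192 and psrad 14 then perform the rounded descaling. */
  *y0 = (int16_t)((x0 * c0 - x1 * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  *y1 = (int16_t)((x0 * c1 + x1 * c0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}
```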
+ +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 +pw_16: times 8 dw 16 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 +%endmacro + +TRANSFORM_COEFFS 6270, 15137 +TRANSFORM_COEFFS 3196, 16069 +TRANSFORM_COEFFS 13623, 9102 + +%macro PAIR_PP_COEFFS 2 +dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MP_COEFFS 2 +dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 +%endmacro + +%macro PAIR_MM_COEFFS 2 +dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 +%endmacro + +PAIR_PP_COEFFS 30274, 12540 +PAIR_PP_COEFFS 6392, 32138 +PAIR_MP_COEFFS 18204, 27246 + +PAIR_PP_COEFFS 12540, 12540 +PAIR_PP_COEFFS 30274, 30274 +PAIR_PP_COEFFS 6392, 6392 +PAIR_PP_COEFFS 32138, 32138 +PAIR_MM_COEFFS 18204, 18204 +PAIR_PP_COEFFS 27246, 27246 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro IDCT8_1D 0 + SUM_SUB 0, 4, 9 + BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 + pmulhrsw m0, m12 + pmulhrsw m4, m12 + BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 + + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + SUM_SUB 0, 6, 9 + SUM_SUB 4, 2, 9 + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 4, 3, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 4 +%endmacro + +; This macro handles 8 pixels per line +%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero + paddw m%1, m11 + paddw m%2, m11 + psraw m%1, 5 + psraw m%2, 5 + + movh m%3, [outputq] + movh m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%3, m%1 + paddw m%4, m%2 + packuswb m%3, m%5 + packuswb m%4, m%5 + movh [outputq], m%3 + movh [outputq + strideq], m%4 +%endmacro + +INIT_XMM ssse3 +; full inverse 8x8 2D-DCT transform +cglobal idct8x8_64_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + mova m4, [inputq + 64] + mova m5, [inputq + 80] + mova m6, [inputq + 96] + mova m7, [inputq + 112] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + IDCT8_1D + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 
10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +cglobal idct8x8_12_add, 3, 5, 13, input, output, stride + mova m8, [pd_8192] + mova m11, [pw_16] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + + mova m0, [inputq + 0] + mova m1, [inputq + 16] + mova m2, [inputq + 32] + mova m3, [inputq + 48] + + punpcklwd m0, m1 + punpcklwd m2, m3 + punpckhdq m9, m0, m2 + punpckldq m0, m2 + SWAP 2, 9 + + ; m0 -> [0], [0] + ; m1 -> [1], [1] + ; m2 -> [2], [2] + ; m3 -> [3], [3] + punpckhqdq m10, m0, m0 + punpcklqdq m0, m0 + punpckhqdq m9, m2, m2 + punpcklqdq m2, m2 + SWAP 1, 10 + SWAP 3, 9 + + pmulhrsw m0, m12 + pmulhrsw m2, [dpw_30274_12540] + pmulhrsw m1, [dpw_6392_32138] + pmulhrsw m3, [dpw_m18204_27246] + + SUM_SUB 0, 2, 9 + SUM_SUB 1, 3, 9 + + punpcklqdq m9, m3, m3 + punpckhqdq m5, m3, m9 + + SUM_SUB 3, 5, 9 + punpckhqdq m5, m3 + pmulhrsw m5, m12 + + punpckhqdq m9, m1, m5 + punpcklqdq m1, m5 + SWAP 5, 9 + + SUM_SUB 0, 5, 9 + SUM_SUB 2, 1, 9 + + punpckhqdq m3, m0, m0 + punpckhqdq m4, m1, m1 + punpckhqdq m6, m5, m5 + punpckhqdq m7, m2, m2 + + punpcklwd m0, m3 + punpcklwd m7, m2 + punpcklwd m1, m4 + punpcklwd m6, m5 + + punpckhdq m4, m0, m7 + punpckldq m0, m7 + punpckhdq m10, m1, m6 + punpckldq m5, m1, m6 + + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m3, m4, m10 + punpcklqdq m2, m4, m10 + + + pmulhrsw m0, m12 + pmulhrsw m6, m2, [dpw_30274_30274] + pmulhrsw m4, m2, [dpw_12540_12540] + + pmulhrsw m7, m1, [dpw_32138_32138] + pmulhrsw m1, [dpw_6392_6392] + pmulhrsw m5, m3, [dpw_m18204_m18204] + pmulhrsw m3, [dpw_27246_27246] + + mova m2, m0 + SUM_SUB 0, 6, 9 + SUM_SUB 2, 4, 9 + SUM_SUB 1, 5, 9 + SUM_SUB 7, 3, 9 + + SUM_SUB 3, 5, 9 + pmulhrsw m3, m12 + pmulhrsw m5, m12 + + SUM_SUB 0, 7, 9 + SUM_SUB 2, 3, 9 + SUM_SUB 4, 5, 9 + SUM_SUB 6, 1, 9 + + SWAP 3, 6 + SWAP 1, 2 + SWAP 2, 4 + + + pxor m12, m12 + ADD_STORE_8P_2X 0, 1, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 2, 3, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 4, 5, 9, 10, 12 + lea outputq, [outputq + r3] + ADD_STORE_8P_2X 6, 7, 9, 10, 12 + + RET + +%endif diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c index 3c5cb8ffd90e..439c028f29d0 100644 --- a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c +++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c @@ -933,7 +933,7 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p, } } -void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p, +void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh, int count) { if (count == 1) diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index fa4dd9bcb8d4..448ad5af7fcb 100644 --- a/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include <emmintrin.h>  /* SSE2 */ +#include <emmintrin.h>  // SSE2 #include "vp9/common/vp9_loopfilter.h" #include "vpx_ports/emmintrin_compat.h" @@ -17,20 +17,14 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), @@ -105,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); @@ -116,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, filter2 = _mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 0xB); - /* Filter1 >> 3 */ + // Filter1 >> 3 filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), @@ -375,32 +369,25 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); - DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16); - DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16); - DECLARE_ALIGNED(16, unsigned char, ap[8][16]); - DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16); - - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; int i = 0; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit =
_limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); @@ -413,16 +400,16 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - _mm_store_si128((__m128i *)ap[4], p4); - _mm_store_si128((__m128i *)ap[3], p3); - _mm_store_si128((__m128i *)ap[2], p2); - _mm_store_si128((__m128i *)ap[1], p1); - _mm_store_si128((__m128i *)ap[0], p0); - _mm_store_si128((__m128i *)aq[4], q4); - _mm_store_si128((__m128i *)aq[3], q3); - _mm_store_si128((__m128i *)aq[2], q2); - _mm_store_si128((__m128i *)aq[1], q1); - _mm_store_si128((__m128i *)aq[0], q0); + _mm_store_si128((__m128i *)&ap[4 * 16], p4); + _mm_store_si128((__m128i *)&ap[3 * 16], p3); + _mm_store_si128((__m128i *)&ap[2 * 16], p2); + _mm_store_si128((__m128i *)&ap[1 * 16], p1); + _mm_store_si128((__m128i *)&ap[0 * 16], p0); + _mm_store_si128((__m128i *)&aq[4 * 16], q4); + _mm_store_si128((__m128i *)&aq[3 * 16], q3); + _mm_store_si128((__m128i *)&aq[2 * 16], q2); + _mm_store_si128((__m128i *)&aq[1 * 16], q1); + _mm_store_si128((__m128i *)&aq[0 * 16], q0); { @@ -486,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ + // Filter1 >> 3 work_a = _mm_cmpgt_epi8(zero, filter1); filter1 = _mm_srli_epi16(filter1, 3); work_a = _mm_and_si128(work_a, te0); @@ -500,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filter1 = _mm_or_si128(filter1, work_a); qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - /* Filter2 >> 3 */ + // Filter2 >> 3 work_a = _mm_cmpgt_epi8(zero, filter2); filter2 = _mm_srli_epi16(filter2, 3); work_a = _mm_and_si128(work_a, te0); @@ -508,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, filter2 = _mm_or_si128(filter2, work_a); ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); work_a = _mm_cmpgt_epi8(zero, filt); filt = _mm_srli_epi16(filt, 1); @@ -546,8 +533,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p5)), _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - _mm_store_si128((__m128i *)ap[5], p5); - _mm_store_si128((__m128i *)aq[5], q5); + _mm_store_si128((__m128i *)&ap[5 * 16], p5); + _mm_store_si128((__m128i *)&aq[5 * 16], q5); flat2 = _mm_max_epu8(work, flat2); p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); @@ -555,8 +542,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p6)), _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - _mm_store_si128((__m128i *)ap[6], p6); - _mm_store_si128((__m128i *)aq[6], q6); + _mm_store_si128((__m128i *)&ap[6 * 16], p6); + _mm_store_si128((__m128i *)&aq[6 * 16], q6); flat2 = _mm_max_epu8(work, flat2); p7 = 
_mm_loadu_si128((__m128i *)(s - 8 * p)); @@ -565,8 +552,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p7)), _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - _mm_store_si128((__m128i *)ap[7], p7); - _mm_store_si128((__m128i *)aq[7], q7); + _mm_store_si128((__m128i *)&ap[7 * 16], p7); + _mm_store_si128((__m128i *)&aq[7 * 16], q7); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); @@ -586,22 +573,38 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, __m128i a, b, c; unsigned int off = i * 8; - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)), + zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)), + zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)), + zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)), + zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)), + zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)), + zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)), + zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)), + zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)), + zero); c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); @@ -610,117 +613,117 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); - _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8], 
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q1, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); - _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q2, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); - _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_storel_epi64((__m128i *)&flat_op[i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q3, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); - _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); b = _mm_add_epi16(q3, b); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); - _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(q4, c); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); b = _mm_add_epi16(q3, b); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); - _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); a = _mm_add_epi16(q5, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q6, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = 
_mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); temp_flat2 = _mm_srli_si128(temp_flat2, 8); @@ -730,51 +733,51 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - work_a = _mm_load_si128((__m128i *)ap[2]); - p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_load_si128((__m128i *)&ap[2 * 16]); + p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)flat_op[2], p2); + _mm_store_si128((__m128i *)&flat_op[2 * 16], p2); - p1 = _mm_load_si128((__m128i *)flat_op[1]); + p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]); work_a = _mm_andnot_si128(flat, ps1); p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - _mm_store_si128((__m128i *)flat_op[1], p1); + _mm_store_si128((__m128i *)&flat_op[1 * 16], p1); - p0 = _mm_load_si128((__m128i *)flat_op[0]); + p0 = _mm_load_si128((__m128i *)&flat_op[0]); work_a = _mm_andnot_si128(flat, ps0); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); - _mm_store_si128((__m128i *)flat_op[0], p0); + _mm_store_si128((__m128i *)&flat_op[0], p0); - q0 = _mm_load_si128((__m128i *)flat_oq[0]); + q0 = _mm_load_si128((__m128i *)&flat_oq[0]); work_a = _mm_andnot_si128(flat, qs0); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); - _mm_store_si128((__m128i *)flat_oq[0], q0); + _mm_store_si128((__m128i *)&flat_oq[0], q0); - q1 = _mm_load_si128((__m128i *)flat_oq[1]); + q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); work_a = _mm_andnot_si128(flat, qs1); q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - _mm_store_si128((__m128i *)flat_oq[1], q1); + _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1); - work_a = _mm_load_si128((__m128i *)aq[2]); - q2 = _mm_load_si128((__m128i 
*)flat_oq[2]); + work_a = _mm_load_si128((__m128i *)&aq[2 * 16]); + q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); - _mm_store_si128((__m128i *)flat_oq[2], q2); + _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2); // write out op6 - op3 { unsigned char *dst = (s - 7 * p); for (i = 6; i > 2; i--) { __m128i flat2_output; - work_a = _mm_load_si128((__m128i *)ap[i]); - flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); + work_a = _mm_load_si128((__m128i *)&ap[i * 16]); + flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); work_a = _mm_or_si128(work_a, flat2_output); @@ -783,43 +786,43 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } - work_a = _mm_load_si128((__m128i *)flat_op[2]); - p2 = _mm_load_si128((__m128i *)flat2_op[2]); + work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]); + p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]); work_a = _mm_andnot_si128(flat2, work_a); p2 = _mm_and_si128(flat2, p2); p2 = _mm_or_si128(work_a, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - work_a = _mm_load_si128((__m128i *)flat_op[1]); - p1 = _mm_load_si128((__m128i *)flat2_op[1]); + work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]); + p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]); work_a = _mm_andnot_si128(flat2, work_a); p1 = _mm_and_si128(flat2, p1); p1 = _mm_or_si128(work_a, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - work_a = _mm_load_si128((__m128i *)flat_op[0]); - p0 = _mm_load_si128((__m128i *)flat2_op[0]); + work_a = _mm_load_si128((__m128i *)&flat_op[0]); + p0 = _mm_load_si128((__m128i *)&flat2_op[0]); work_a = _mm_andnot_si128(flat2, work_a); p0 = _mm_and_si128(flat2, p0); p0 = _mm_or_si128(work_a, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - work_a = _mm_load_si128((__m128i *)flat_oq[0]); - q0 = _mm_load_si128((__m128i *)flat2_oq[0]); + work_a = _mm_load_si128((__m128i *)&flat_oq[0]); + q0 = _mm_load_si128((__m128i *)&flat2_oq[0]); work_a = _mm_andnot_si128(flat2, work_a); q0 = _mm_and_si128(flat2, q0); q0 = _mm_or_si128(work_a, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - work_a = _mm_load_si128((__m128i *)flat_oq[1]); - q1 = _mm_load_si128((__m128i *)flat2_oq[1]); + work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); + q1 = _mm_load_si128((__m128i *)&flat2_oq[16]); work_a = _mm_andnot_si128(flat2, work_a); q1 = _mm_and_si128(flat2, q1); q1 = _mm_or_si128(work_a, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - work_a = _mm_load_si128((__m128i *)flat_oq[2]); - q2 = _mm_load_si128((__m128i *)flat2_oq[2]); + work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); + q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]); work_a = _mm_andnot_si128(flat2, work_a); q2 = _mm_and_si128(flat2, q2); q2 = _mm_or_si128(work_a, q2); @@ -830,8 +833,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, unsigned char *dst = (s + 3 * p); for (i = 3; i < 7; i++) { __m128i flat2_output; - work_a = _mm_load_si128((__m128i *)aq[i]); - flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); + work_a = _mm_load_si128((__m128i *)&aq[i * 16]); + flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); work_a = _mm_or_si128(work_a, flat2_output); @@ -842,65 +845,62 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } -void 
vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - int count) { +// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. +void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, int count) { if (count == 1) mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); else mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); } -void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - int count) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - __m128i mask, hev, flat; +void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, int count) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; (void)count; - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); + // filter_mask and 
hev_mask const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); @@ -910,36 +910,32 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); + mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); + // flat_mask4 + + flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); } + { const __m128i four = _mm_set1_epi16(4); unsigned char *src = s; @@ -996,11 +992,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), @@ -1018,33 +1010,27 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt 
= _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); - /* filt >> 1 */ + // filt >> 1 filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); filt = _mm_andnot_si128(hev, filt); @@ -1093,12 +1079,397 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, } } +void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + // filter_mask and hev_mask + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = 
_mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + + do { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), 
q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < 2); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), 
q2); + } +} + +void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, hev, flat; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + 
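// The two saturating byte adds below complete
+ // filt = filter + 3 * (qs0 - ps0) without a multiply; _mm_adds_epi8
+ // clamps each step to [-128, 127], mirroring the scalar filter's
+ // signed_char_clamp().
+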
filt = _mm_adds_epi8(filt, work_a); + // (vp9_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + } +} + static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, int in_p, unsigned char *out, int out_p) { __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i x8, x9, x10, x11, x12, x13, x14, x15; - /* Read in 16 lines */ + // Read in 16 lines x0 = _mm_loadl_epi64((__m128i *)in0); x8 = _mm_loadl_epi64((__m128i *)in1); x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); @@ -1136,7 +1507,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); - /* Store first 4-line result */ + // Store first 4-line result _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); @@ -1152,7 +1523,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, x14 = _mm_unpacklo_epi32(x12, x13); x15 = _mm_unpackhi_epi32(x12, x13); - /* Store second 4-line result */ + // Store second 4-line result _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); @@ -1222,61 +1593,124 @@ static INLINE void transpose(unsigned char *src[], int in_p, } while (++idx8x8 < num_8x8_to_transpose); } -void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); +void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); unsigned char *src[2]; unsigned char *dst[2]; - (void)count; - /* Transpose 16x16 */ - transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16); - transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16); + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - /* Loop filtering */ - 
vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); - src[0] = t_dst + 3 * 16; - src[1] = t_dst + 3 * 16 + 8; + // Loop filtering + vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; - dst[0] = s - 5; - dst[1] = s - 5 + p * 8; - - /* Transpose 16x8 */ + // Transpose back transpose(src, 16, dst, p, 2); } -void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); - unsigned char *src[4]; - unsigned char *dst[4]; +void vp9_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8); + unsigned char *src[1]; + unsigned char *dst[1]; + (void)count; + // Transpose 8x8 + src[0] = s - 4; dst[0] = t_dst; - dst[1] = t_dst + 8 * 16; - src[0] = s - 8; - src[1] = s - 8 + 8; + transpose(src, p, dst, 8, 1); - /* Transpose 16x16 */ - transpose(src, p, dst, 16, 2); - - /* Loop filtering */ - vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, 1); + // Loop filtering + vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); src[0] = t_dst; - src[1] = t_dst + 8 * 16; + dst[0] = s - 4; - dst[0] = s - 8; - dst[1] = s - 8 + 8; + // Transpose back + transpose(src, 8, dst, p, 1); +} +void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back transpose(src, 16, dst, p, 2); } + +void vp9_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, + thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm 
b/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm index 4ebb51b77274..91055b9f9d49 100644 --- a/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/media/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -12,7 +12,7 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp9_loop_filter_horizontal_edge_mmx +;void vp9_lpf_horizontal_4_mmx ;( ; unsigned char *src_ptr, ; int src_pixel_step, @@ -21,8 +21,8 @@ ; const char *thresh, ; int count ;) -global sym(vp9_loop_filter_horizontal_edge_mmx) PRIVATE -sym(vp9_loop_filter_horizontal_edge_mmx): +global sym(vp9_lpf_horizontal_4_mmx) PRIVATE +sym(vp9_lpf_horizontal_4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -224,7 +224,7 @@ sym(vp9_loop_filter_horizontal_edge_mmx): ret -;void vp9_loop_filter_vertical_edge_mmx +;void vp9_lpf_vertical_4_mmx ;( ; unsigned char *src_ptr, ; int src_pixel_step, @@ -233,8 +233,8 @@ sym(vp9_loop_filter_horizontal_edge_mmx): ; const char *thresh, ; int count ;) -global sym(vp9_loop_filter_vertical_edge_mmx) PRIVATE -sym(vp9_loop_filter_vertical_edge_mmx): +global sym(vp9_lpf_vertical_4_mmx) PRIVATE +sym(vp9_lpf_vertical_4_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 @@ -527,7 +527,7 @@ sym(vp9_loop_filter_vertical_edge_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 diff --git a/media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm b/media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm deleted file mode 100644 index c2118dbb74e7..000000000000 --- a/media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm +++ /dev/null @@ -1,534 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - -;void vp9_post_proc_down_and_across_mmx -;( -; unsigned char *src_ptr, -; unsigned char *dst_ptr, -; int src_pixels_per_line, -; int dst_pixels_per_line, -; int rows, -; int cols, -; int flimit -;) -global sym(vp9_post_proc_down_and_across_mmx) PRIVATE -sym(vp9_post_proc_down_and_across_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movq mm0, [GLOBAL(rd)] - sub rsp, 8 - movq [rsp], mm0 -%define RD [rsp] -%else -%define RD [GLOBAL(rd)] -%endif - - push rbx - lea rbx, [GLOBAL(Blur)] - movd mm2, dword ptr arg(6) ;flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
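; Roughly, the loop below computed, for each pixel of the current row r:
;   sum = k0*r[-2] + k1*r[-1] + k2*r[0] + k3*r[+1] + k4*r[+2]
;   blurred = (sum + round) >> VP9_FILTER_SHIFT
;   out = (|r[0] - r[i]| > flimit for any tap) ? r[0] : blurred
; i.e. a 5-tap vertical blur that falls back to the source pixel across
; strong edges, followed by the same filter applied across each row in
; place.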
- pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter -.nextcol: - - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps - movq mm3, [rsi] ; mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] ; mm6 = kernel 3 taps - movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 - pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = r0 p0..p3 - psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers - movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - neg rax - movq mm6, [rbx ] ; kernel 0 taps - movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] ; kernel 1 taps - movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
- paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - - movd [rdi], mm1 ; - neg rax ; pitch is positive - - - add rsi, 4 - add rdi, 4 - add rdx, 4 - - cmp edx, dword ptr arg(5) ;cols - jl .nextcol - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - - push rax - xor rdx, rdx - mov rax, [rdi-4]; - -.acrossnextcol: - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; - movq mm4, [rdi+rdx] ; mm4 = p0..p7 - movq mm3, mm4 ; mm3 = p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] - psrlq mm4, 8 ; mm4 = p1..p7 - movq mm5, mm4 ; mm5 = p1..p7 - punpcklbw mm5, mm0 ; mm5 = p1..p4 - pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = p0..p3 - psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] - psrlq mm4, 8 ; mm4 = p2..p7 - movq mm5, mm4 ; mm5 = p2..p7 - punpcklbw mm5, mm0 ; mm5 = p2..p5 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - movq mm6, [rbx ] - movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 - movq mm5, mm4 ; mm5 = p-2..p5 - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] - psrlq mm4, 8 ; mm4 = p-1..p5 - punpcklbw mm4, mm0 ; mm4 = p-1..p2 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes - movd eax, mm1 - - add rdx, 4 - cmp edx, dword ptr arg(5) ;cols - jl .acrossnextcol; - - mov DWORD PTR [rdi+rdx-4], eax - pop rax - - ; done with this rwo - add rsi,rax ; next line - movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
- - dec rcx ; decrement count - jnz .nextrow ; next row - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef RD - - -;void vp9_mbpost_proc_down_mmx(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vp9_rv) -global sym(vp9_mbpost_proc_down_mmx) PRIVATE -sym(vp9_mbpost_proc_down_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 136 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax -%define flimit2 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp9_rv))] -%endif - - ;rows +=8; - add dword ptr arg(2), 8 - - ;for(c=0; c +#include <immintrin.h> +#include "vpx_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# else // clang > 3.3, and not 5.0 on macosx. +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // clang <= 3.3 +#elif defined(__GNUC__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +# define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +# else // gcc > 4.7 +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +# endif // gcc <= 4.6 +#else // !(gcc || clang) +# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register.
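+ // (the self-pack saturates each 16-bit tap to int8; VP9 subpel taps all
+ // fit in [-128, 127], so the narrowing is lossless and the byte taps can
+ // feed pmaddubsw directly)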
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i-=2) { + // load the 2 strides of source + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr-3))); + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line-3)), 1); + + // filter the source buffer + srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+5))); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm_loadu_si128((__m128i *) + (src_ptr+src_pixels_per_line+5)), 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source 
buffer + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, + _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, + srcRegFilt32b2_1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr+=dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results 
together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, + _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, + _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + } +} + +void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + unsigned int src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
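+ // (pmaddubsw treats the source bytes as unsigned and these taps as
+ // signed, summing adjacent pairs into 16-bit lanes; that is why the
+ // input rows are interleaved in pairs with unpacklo/unpackhi below)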
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, + _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + + for (i = output_height; i > 1; i-=2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the 
result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr+=src_stride; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, + _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i*)(output_ptr+out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr+=dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the last 2 results together + srcRegFilt4 = _mm_unpacklo_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = _mm_unpackhi_epi8( + _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, + _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, + _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = 
_mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, + _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + } +} diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000000..c4efa6565f3d --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> +#include "vpx_ports/mem.h" +#include "vpx_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 =_mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + shuffle1 =_mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2= _mm_shuffle_epi8(srcReg, shuffle2); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 =
_mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr+=src_pixels_per_line; + + // save only 4 bytes + *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + srcRegFilt2= _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int 
output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); + + // filter the source buffer + srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes. 
+ // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); + srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, + _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + src_ptr+=src_pixels_per_line; + + // save 16 bytes + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); + + output_ptr+=output_pitch; + } +} + +void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
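+ // (the four pair-sums are combined below with saturating adds; adding
+ // the min of the two middle-tap products before the max keeps the
+ // intermediate sums from clipping prematurely)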
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 8 bytes + srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); + srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + + // load the next 8 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); + srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); + srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); + srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr+=src_pitch; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); + + output_ptr+=out_pitch; + } +} + +void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + int16_t *filter) { + __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((__m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
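+ // (each output row consumes eight consecutive input rows at one column:
+ // the loop below loads rows 0..7, unpacks them in pairs, and applies two
+ // taps per pair with pmaddubsw)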
+ filtersReg =_mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + for (i = 0; i < output_height; i++) { + // load the first 16 bytes + srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); + // load the next 16 bytes in stride of src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); + srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); + + // merge the result together + srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); + srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); + srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + + // load the next 16 bytes in stride of two/three src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); + + // merge the result together + srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); + srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); + + // load the next 16 bytes in stride of four/five src_pitch + srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); + srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); + + // merge the result together + srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); + srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); + srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_min_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_min_epi16(srcRegFilt6, srcRegFilt8)); + + // add and saturate the results together + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, + _mm_max_epi16(srcRegFilt4, srcRegFilt7)); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, + _mm_max_epi16(srcRegFilt6, srcRegFilt8)); + srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // 
convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); + + src_ptr+=src_pitch; + + // save 16 bytes convolve result + _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); + + output_ptr+=out_pitch; + } +} diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 634fa77462fc..fd781d4bc6a6 100644 --- a/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -272,22 +272,23 @@ punpcklbw xmm2, xmm3 ;C D punpcklbw xmm4, xmm5 ;E F - movq xmm6, [rsi + rbx + 8] ;G movq xmm7, [rax + rbx + 8] ;H punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm0, k0k1 pmaddubsw xmm2, k2k3 pmaddubsw xmm4, k4k5 pmaddubsw xmm6, k6k7 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + movdqa xmm1, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000000..d94ccf2e9b79 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm @@ -0,0 +1,448 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + 
lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq 
xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_sse2) PRIVATE +sym(vp9_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000000..b5e18fe6d4a7 --- /dev/null +++ b/media/libvpx/vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movq xmm2, rcx ;rounding + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + paddsw xmm0, xmm2 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movq xmm6, rcx ;rounding + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + paddsw xmm0, xmm6 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + paddsw xmm0, xmm6 ;rounding + paddsw xmm2, xmm6 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE +sym(vp9_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE 
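+; the _avg variants run the same bilinear filter and then pavgb the result
+; against the bytes already at the destination (APPLY_FILTER_* with %1 = 1),
+; i.e. a rounded average as used for compound prediction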
+sym(vp9_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp9/decoder/vp9_dboolhuff.h b/media/libvpx/vp9/decoder/vp9_dboolhuff.h deleted file mode 100644 index fd8e74ca42e5..000000000000 --- a/media/libvpx/vp9/decoder/vp9_dboolhuff.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_DECODER_VP9_DBOOLHUFF_H_
-#define VP9_DECODER_VP9_DBOOLHUFF_H_
-
-#include <stddef.h>
-#include <limits.h>
-
-#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-typedef size_t VP9_BD_VALUE;
-
-#define BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
-
-typedef struct {
-  const uint8_t *buffer_end;
-  const uint8_t *buffer;
-  VP9_BD_VALUE value;
-  int count;
-  unsigned int range;
-} vp9_reader;
-
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
-int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size);
-
-void vp9_reader_fill(vp9_reader *r);
-
-const uint8_t *vp9_reader_find_end(vp9_reader *r);
-
-static int vp9_read(vp9_reader *br, int probability) {
-  unsigned int bit = 0;
-  VP9_BD_VALUE value;
-  VP9_BD_VALUE bigsplit;
-  int count;
-  unsigned int range;
-  unsigned int split = ((br->range * probability) + (256 - probability)) >> 8;
-
-  if (br->count < 0)
-    vp9_reader_fill(br);
-
-  value = br->value;
-  count = br->count;
-
-  bigsplit = (VP9_BD_VALUE)split << (BD_VALUE_SIZE - 8);
-
-  range = split;
-
-  if (value >= bigsplit) {
-    range = br->range - split;
-    value = value - bigsplit;
-    bit = 1;
-  }
-
-  {
-    register unsigned int shift = vp9_norm[range];
-    range <<= shift;
-    value <<= shift;
-    count -= shift;
-  }
-  br->value = value;
-  br->count = count;
-  br->range = range;
-
-  return bit;
-}
-
-static int vp9_read_bit(vp9_reader *r) {
-  return vp9_read(r, 128);  // vp9_prob_half
-}
-
-static int vp9_read_literal(vp9_reader *br, int bits) {
-  int z = 0, bit;
-
-  for (bit = bits - 1; bit >= 0; bit--)
-    z |= vp9_read_bit(br) << bit;
-
-  return z;
-}
-
-int vp9_reader_has_error(vp9_reader *r);
-
-#endif  // VP9_DECODER_VP9_DBOOLHUFF_H_
diff --git a/media/libvpx/vp9/decoder/vp9_decodeframe.c b/media/libvpx/vp9/decoder/vp9_decodeframe.c
new file mode 100644
index 000000000000..a9c03f0cc70d
--- /dev/null
+++ b/media/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>  // qsort()
+
+#include "./vp9_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_scale/vpx_scale.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_thread.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/decoder/vp9_decodeframe.h"
+#include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_decodemv.h"
+#include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_dsubexp.h"
+#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_read_bit_buffer.h"
+#include "vp9/decoder/vp9_reader.h"
+
+#define MAX_VP9_HEADER_SIZE 80
+
+static int is_compound_reference_allowed(const VP9_COMMON *cm) {
+  int i;
+  for (i = 1; i < REFS_PER_FRAME; ++i)
+    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
+      return 1;
+
+  return 0;
+}
+
+static void setup_compound_reference_mode(VP9_COMMON *cm) {
+  if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+          cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+    cm->comp_fixed_ref = ALTREF_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = GOLDEN_FRAME;
+  } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+                 cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+    cm->comp_fixed_ref = GOLDEN_FRAME;
+    cm->comp_var_ref[0] = LAST_FRAME;
+    cm->comp_var_ref[1] = ALTREF_FRAME;
+  } else {
+    cm->comp_fixed_ref = LAST_FRAME;
+    cm->comp_var_ref[0] = GOLDEN_FRAME;
+    cm->comp_var_ref[1] = ALTREF_FRAME;
+  }
+}
+
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+  return len != 0 && len <= (size_t)(end - start);
+}
+
+static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
+  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
+  return data > max ? max : data;
+}
+
+static TX_MODE read_tx_mode(vp9_reader *r) {
+  TX_MODE tx_mode = vp9_read_literal(r, 2);
+  if (tx_mode == ALLOW_32X32)
+    tx_mode += vp9_read_bit(r);
+  return tx_mode;
+}
+
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+  int i, j;
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+    for (j = 0; j < TX_SIZES - 3; ++j)
+      vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+    for (j = 0; j < TX_SIZES - 2; ++j)
+      vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
+
+  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
+    for (j = 0; j < TX_SIZES - 1; ++j)
+      vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
+}
+
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+  int i, j;
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+    for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+      vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
+}
+
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
+  int i, j;
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+    for (j = 0; j < INTER_MODES - 1; ++j)
+      vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+}
+
+static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
+                                                vp9_reader *r) {
+  if (is_compound_reference_allowed(cm)) {
+    return vp9_read_bit(r) ? (vp9_read_bit(r) ?
REFERENCE_MODE_SELECT + : COMPOUND_REFERENCE) + : SINGLE_REFERENCE; + } else { + return SINGLE_REFERENCE; + } +} + +static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) { + FRAME_CONTEXT *const fc = &cm->fc; + int i; + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_inter_prob[i]); + + if (cm->reference_mode != COMPOUND_REFERENCE) + for (i = 0; i < REF_CONTEXTS; ++i) { + vp9_diff_update_prob(r, &fc->single_ref_prob[i][0]); + vp9_diff_update_prob(r, &fc->single_ref_prob[i][1]); + } + + if (cm->reference_mode != SINGLE_REFERENCE) + for (i = 0; i < REF_CONTEXTS; ++i) + vp9_diff_update_prob(r, &fc->comp_ref_prob[i]); +} + +static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) { + int i; + for (i = 0; i < n; ++i) + if (vp9_read(r, MV_UPDATE_PROB)) + p[i] = (vp9_read_literal(r, 7) << 1) | 1; +} + +static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) { + int i, j; + + update_mv_probs(ctx->joints, MV_JOINTS - 1, r); + + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->sign, 1, r); + update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r); + update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r); + update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r); + } + + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + for (j = 0; j < CLASS0_SIZE; ++j) + update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r); + update_mv_probs(comp_ctx->fp, 3, r); + } + + if (allow_hp) { + for (i = 0; i < 2; ++i) { + nmv_component *const comp_ctx = &ctx->comps[i]; + update_mv_probs(&comp_ctx->class0_hp, 1, r); + update_mv_probs(&comp_ctx->hp, 1, r); + } + } +} + +static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { + int i; + xd->plane[0].dequant = cm->y_dequant[q_index]; + + for (i = 1; i < MAX_MB_PLANE; i++) + xd->plane[i].dequant = cm->uv_dequant[q_index]; +} + +static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, + TX_SIZE tx_size, uint8_t *dst, int stride, + int eob) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + if (eob > 0) { + TX_TYPE tx_type = DCT_DCT; + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + if (xd->lossless) { + tx_type = DCT_DCT; + vp9_iwht4x4_add(dqcoeff, dst, stride, eob); + } else { + const PLANE_TYPE plane_type = pd->plane_type; + switch (tx_size) { + case TX_4X4: + tx_type = get_tx_type_4x4(plane_type, xd, block); + vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_8X8: + tx_type = get_tx_type(plane_type, xd); + vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_16X16: + tx_type = get_tx_type(plane_type, xd); + vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); + break; + case TX_32X32: + tx_type = DCT_DCT; + vp9_idct32x32_add(dqcoeff, dst, stride, eob); + break; + default: + assert(0 && "Invalid transform size"); + } + } + + if (eob == 1) { + vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0])); + } else { + if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) + vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); + else if (tx_size == TX_32X32 && eob <= 34) + vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); + else + vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); + } + } +} + +struct intra_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; +}; + +static void predict_and_reconstruct_intra_block(int plane, int block, + 
BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct intra_args *const args = (struct intra_args *)arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + MODE_INFO *const mi = xd->mi[0]; + const PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block) + : mi->mbmi.uv_mode; + int x, y; + uint8_t *dst; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x]; + + vp9_predict_intra_block(xd, block >> (tx_size << 1), + b_width_log2(plane_bsize), tx_size, mode, + dst, pd->dst.stride, dst, pd->dst.stride, + x, y, plane); + + if (!mi->mbmi.skip) { + const int eob = vp9_decode_block_tokens(cm, xd, plane, block, + plane_bsize, x, y, tx_size, + args->r); + inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride, + eob); + } +} + +struct inter_args { + VP9_COMMON *cm; + MACROBLOCKD *xd; + vp9_reader *r; + int *eobtotal; +}; + +static void reconstruct_inter_block(int plane, int block, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct inter_args *args = (struct inter_args *)arg; + VP9_COMMON *const cm = args->cm; + MACROBLOCKD *const xd = args->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + int x, y, eob; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); + eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, + tx_size, args->r); + inverse_transform_block(xd, plane, block, tx_size, + &pd->dst.buf[4 * y * pd->dst.stride + 4 * x], + pd->dst.stride, eob); + *args->eobtotal += eob; +} + +static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int x_mis = MIN(bw, cm->mi_cols - mi_col); + const int y_mis = MIN(bh, cm->mi_rows - mi_row); + const int offset = mi_row * cm->mi_stride + mi_col; + int x, y; + + xd->mi = cm->mi_grid_visible + offset; + xd->mi[0] = &cm->mi[offset]; + xd->mi[0]->mbmi.sb_type = bsize; + for (y = 0; y < y_mis; ++y) + for (x = !y; x < x_mis; ++x) + xd->mi[y * cm->mi_stride + x] = xd->mi[0]; + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. 
These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return &xd->mi[0]->mbmi; +} + +static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int idx, int mi_row, int mi_col) { + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME]; + xd->block_refs[idx] = ref_buffer; + if (!vp9_is_valid_scale(&ref_buffer->sf)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid scale factors"); + if (ref_buffer->buf->corrupted) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Block reference is corrupt"); + vp9_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col, + &ref_buffer->sf); + xd->corrupted |= ref_buffer->buf->corrupted; +} + +static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader *r, BLOCK_SIZE bsize) { + const int less8x8 = bsize < BLOCK_8X8; + MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col); + vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r); + + if (less8x8) + bsize = BLOCK_8X8; + + if (mbmi->skip) { + reset_skip_context(xd, bsize); + } else { + if (cm->seg.enabled) + setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, + cm->base_qindex)); + } + + if (!is_inter_block(mbmi)) { + struct intra_args arg = { cm, xd, r }; + vp9_foreach_transformed_block(xd, bsize, + predict_and_reconstruct_intra_block, &arg); + } else { + // Setup + set_ref(cm, xd, 0, mi_row, mi_col); + if (has_second_ref(mbmi)) + set_ref(cm, xd, 1, mi_row, mi_col); + + // Prediction + vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); + + // Reconstruction + if (!mbmi->skip) { + int eobtotal = 0; + struct inter_args arg = { cm, xd, r, &eobtotal }; + vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); + if (!less8x8 && eobtotal == 0) + mbmi->skip = 1; // skip loopfilter + } + } + + xd->corrupted |= vp9_reader_has_error(r); +} + +static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, + int mi_row, int mi_col, BLOCK_SIZE bsize, + vp9_reader *r) { + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + const vp9_prob *const probs = get_partition_probs(cm, ctx); + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + PARTITION_TYPE p; + + if (has_rows && has_cols) + p = (PARTITION_TYPE)vp9_read_tree(r, vp9_partition_tree, probs); + else if (!has_rows && has_cols) + p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; + else if (has_rows && !has_cols) + p = vp9_read(r, probs[2]) ? 
PARTITION_SPLIT : PARTITION_VERT; + else + p = PARTITION_SPLIT; + + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.partition[ctx][p]; + + return p; +} + +static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, + const TileInfo *const tile, + int mi_row, int mi_col, + vp9_reader* r, BLOCK_SIZE bsize) { + const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; + PARTITION_TYPE partition; + BLOCK_SIZE subsize, uv_subsize; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + return; + + partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); + subsize = get_subsize(bsize, partition); + uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y]; + if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + if (subsize < BLOCK_8X8) { + decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + } else { + switch (partition) { + case PARTITION_NONE: + decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + break; + case PARTITION_HORZ: + decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + if (mi_row + hbs < cm->mi_rows) + decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize); + break; + case PARTITION_VERT: + decode_block(cm, xd, tile, mi_row, mi_col, r, subsize); + if (mi_col + hbs < cm->mi_cols) + decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize); + break; + case PARTITION_SPLIT: + decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize); + decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize); + decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize); + break; + default: + assert(0 && "Invalid partition type"); + } + } + + // update partition context + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +} + +static void setup_token_decoder(const uint8_t *data, + const uint8_t *data_end, + size_t read_size, + struct vpx_internal_error_info *error_info, + vp9_reader *r, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // Validate the calculated partition length. If the buffer + // described by the partition can't be fully read, then restrict + // it to the portion that can be (for EC mode) or throw an error. 
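A standalone sketch of the read_is_valid() predicate that the check below relies on; the main() driver is illustrative only and not part of the imported source:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same predicate as the decoder's read_is_valid(): comparing len against
       (end - start) avoids ever forming a pointer past 'end'. */
    static int read_is_valid(const uint8_t *start, size_t len,
                             const uint8_t *end) {
      return len != 0 && len <= (size_t)(end - start);
    }

    int main(void) {
      uint8_t buf[16];
      printf("%d\n", read_is_valid(buf, 16, buf + sizeof(buf)));  /* 1 */
      printf("%d\n", read_is_valid(buf, 17, buf + sizeof(buf)));  /* 0: truncated */
      printf("%d\n", read_is_valid(buf, 0, buf + sizeof(buf)));   /* 0: empty */
      return 0;
    }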
+ if (!read_is_valid(data, read_size, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (vp9_reader_init(r, data, read_size, decrypt_cb, decrypt_state)) + vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", 1); +} + +static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, + vp9_reader *r) { + int i, j, k, l, m; + + if (vp9_read_bit(r)) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) + for (m = 0; m < UNCONSTRAINED_NODES; ++m) + vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); +} + +static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, + vp9_reader *r) { + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + read_coef_probs_common(fc->coef_probs[tx_size], r); +} + +static void setup_segmentation(struct segmentation *seg, + struct vp9_read_bit_buffer *rb) { + int i, j; + + seg->update_map = 0; + seg->update_data = 0; + + seg->enabled = vp9_rb_read_bit(rb); + if (!seg->enabled) + return; + + // Segmentation map update + seg->update_map = vp9_rb_read_bit(rb); + if (seg->update_map) { + for (i = 0; i < SEG_TREE_PROBS; i++) + seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) + : MAX_PROB; + + seg->temporal_update = vp9_rb_read_bit(rb); + if (seg->temporal_update) { + for (i = 0; i < PREDICTION_PROBS; i++) + seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) + : MAX_PROB; + } else { + for (i = 0; i < PREDICTION_PROBS; i++) + seg->pred_probs[i] = MAX_PROB; + } + } + + // Segmentation data update + seg->update_data = vp9_rb_read_bit(rb); + if (seg->update_data) { + seg->abs_delta = vp9_rb_read_bit(rb); + + vp9_clearall_segfeatures(seg); + + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < SEG_LVL_MAX; j++) { + int data = 0; + const int feature_enabled = vp9_rb_read_bit(rb); + if (feature_enabled) { + vp9_enable_segfeature(seg, i, j); + data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j)); + if (vp9_is_segfeature_signed(j)) + data = vp9_rb_read_bit(rb) ? -data : data; + } + vp9_set_segdata(seg, i, j, data); + } + } + } +} + +static void setup_loopfilter(struct loopfilter *lf, + struct vp9_read_bit_buffer *rb) { + lf->filter_level = vp9_rb_read_literal(rb, 6); + lf->sharpness_level = vp9_rb_read_literal(rb, 3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. + lf->mode_ref_delta_update = 0; + + lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = vp9_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { + int i; + + for (i = 0; i < MAX_REF_LF_DELTAS; i++) + if (vp9_rb_read_bit(rb)) + lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6); + + for (i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (vp9_rb_read_bit(rb)) + lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6); + } + } +} + +static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { + const int old = *delta_q; + *delta_q = vp9_rb_read_bit(rb) ? 
vp9_rb_read_signed_literal(rb, 4) : 0; + return old != *delta_q; +} + +static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, + struct vp9_read_bit_buffer *rb) { + int update = 0; + + cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS); + update |= read_delta_q(rb, &cm->y_dc_delta_q); + update |= read_delta_q(rb, &cm->uv_dc_delta_q); + update |= read_delta_q(rb, &cm->uv_ac_delta_q); + if (update) + vp9_init_dequantizer(cm); + + xd->lossless = cm->base_qindex == 0 && + cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && + cm->uv_ac_delta_q == 0; +} + +static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) { + const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP, + BILINEAR }; + return vp9_rb_read_bit(rb) ? SWITCHABLE + : literal_to_filter[vp9_rb_read_literal(rb, 2)]; +} + +void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, + int *width, int *height) { + const int w = vp9_rb_read_literal(rb, 16) + 1; + const int h = vp9_rb_read_literal(rb, 16) + 1; + *width = w; + *height = h; +} + +static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { + cm->display_width = cm->width; + cm->display_height = cm->height; + if (vp9_rb_read_bit(rb)) + vp9_read_frame_size(rb, &cm->display_width, &cm->display_height); +} + +static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Width and height beyond allowed size."); +#endif + if (cm->width != width || cm->height != height) { + const int new_mi_rows = + ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_mi_cols = + ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + + // Allocations in vp9_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. 
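The grow-only reallocation test follows below. As a standalone sketch of the pixel-to-mode-info conversion computed just above (MI_SIZE_LOG2 is 3, i.e. one mode-info unit is 8x8 pixels; the sample dimensions are arbitrary):

    #include <stdio.h>

    /* Mirrors the decoder's ALIGN_POWER_OF_TWO() macro. */
    #define MI_SIZE_LOG2 3
    #define ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

    int main(void) {
      const int width = 1920, height = 1082;  /* height not a multiple of 8 */
      const int mi_cols = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
      const int mi_rows = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
      printf("%d x %d mi units\n", mi_cols, mi_rows);  /* 240 x 136 */
      return 0;
    }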
+ if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { + if (vp9_alloc_context_buffers(cm, width, height)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } else { + vp9_set_mb_mi(cm, width, height); + } + vp9_init_context_buffers(cm); + cm->width = width; + cm->height = height; + } +} + +static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { + int width, height; + vp9_read_frame_size(rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_display_size(cm, rb); + + if (vp9_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, + cm->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } +} + +static void setup_frame_size_with_refs(VP9_COMMON *cm, + struct vp9_read_bit_buffer *rb) { + int width, height; + int found = 0, i; + int has_valid_ref_frame = 0; + for (i = 0; i < REFS_PER_FRAME; ++i) { + if (vp9_rb_read_bit(rb)) { + YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + if (buf->corrupted) { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Frame reference is corrupt"); + } + found = 1; + break; + } + } + + if (!found) + vp9_read_frame_size(rb, &width, &height); + + if (width <=0 || height <= 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of frames that this frame references + // has valid dimensions. + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_frame = &cm->frame_refs[i]; + has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, + width, height); + } + if (!has_valid_ref_frame) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); + + resize_context_buffers(cm, width, height); + setup_display_size(cm, rb); + + if (vp9_realloc_frame_buffer( + get_frame_new_buffer(cm), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_DEC_BORDER_IN_PIXELS, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb, + cm->cb_priv)) { + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } +} + +static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { + int min_log2_tile_cols, max_log2_tile_cols, max_ones; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + // columns + max_ones = max_log2_tile_cols - min_log2_tile_cols; + cm->log2_tile_cols = min_log2_tile_cols; + while (max_ones-- && vp9_rb_read_bit(rb)) + cm->log2_tile_cols++; + + if (cm->log2_tile_cols > 6) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid number of tile columns"); + + // rows + cm->log2_tile_rows = vp9_rb_read_bit(rb); + if (cm->log2_tile_rows) + cm->log2_tile_rows += vp9_rb_read_bit(rb); +} + +typedef struct TileBuffer { + const uint8_t *data; + size_t size; + int col; // only used with multi-threaded decoding +} TileBuffer; + +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. 
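get_tile_buffer(), defined next, reads that size as a 4-byte big-endian prefix via mem_get_be32(). A rough standalone equivalent of that read (the helper name matches vpx_ports/mem_ops.h; the driver is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* Big-endian 32-bit load, as used for the per-tile size prefix. */
    static uint32_t mem_get_be32(const uint8_t *p) {
      return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
             ((uint32_t)p[2] << 8) | (uint32_t)p[3];
    }

    int main(void) {
      const uint8_t prefix[4] = { 0x00, 0x01, 0x02, 0x03 };
      printf("tile size = %u bytes\n", mem_get_be32(prefix));  /* 66051 */
      return 0;
    }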
+static void get_tile_buffer(const uint8_t *const data_end, + int is_last, + struct vpx_internal_error_info *error_info, + const uint8_t **data, + vpx_decrypt_cb decrypt_cb, void *decrypt_state, + TileBuffer *buf) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, 4, data_end)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (decrypt_cb) { + uint8_t be_data[4]; + decrypt_cb(decrypt_state, *data, be_data, 4); + size = mem_get_be32(be_data); + } else { + size = mem_get_be32(*data); + } + *data += 4; + + if (size > (size_t)(data_end - *data)) + vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + } else { + size = data_end - *data; + } + + buf->data = *data; + buf->size = size; + + *data += size; +} + +static void get_tile_buffers(VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + int tile_cols, int tile_rows, + TileBuffer (*tile_buffers)[1 << 6]) { + int r, c; + + for (r = 0; r < tile_rows; ++r) { + for (c = 0; c < tile_cols; ++c) { + const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1); + TileBuffer *const buf = &tile_buffers[r][c]; + buf->col = c; + get_tile_buffer(data_end, is_last, &pbi->common.error, &data, + pbi->decrypt_cb, pbi->decrypt_state, buf); + } + } +} + +static const uint8_t *decode_tiles(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + TileBuffer tile_buffers[4][1 << 6]; + int tile_row, tile_col; + int mi_row, mi_col; + TileData *tile_data = NULL; + + if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + vpx_memalign(32, sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + if (cm->lf.filter_level) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + // Be sure to sync as we might be resuming after a failed frame decode. + winterface->sync(&pbi->lf_worker); + lf_data->frame_buffer = get_frame_new_buffer(cm); + lf_data->cm = cm; + vp9_copy(lf_data->planes, pbi->mb.plane); + lf_data->stop = 0; + lf_data->y_only = 0; + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); + } + + assert(tile_rows <= 4); + assert(tile_cols <= (1 << 6)); + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + vpx_memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols); + + vpx_memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_cols); + + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); + + if (pbi->tile_data == NULL || + (tile_cols * tile_rows) != pbi->total_tiles) { + vpx_free(pbi->tile_data); + CHECK_MEM_ERROR( + cm, + pbi->tile_data, + vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data)))); + pbi->total_tiles = tile_rows * tile_cols; + } + + // Load all tile information into tile_data. 
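The loading loop announced above follows. The tile grid it walks is derived from log2 fields in the frame header; columns are capped at 1 << 6 and rows at 4, matching the asserts earlier in decode_tiles(). A toy computation with arbitrary header values:

    #include <stdio.h>

    int main(void) {
      const int log2_tile_cols = 2, log2_tile_rows = 1;  /* example values */
      const int tile_cols = 1 << log2_tile_cols;         /* 4 */
      const int tile_rows = 1 << log2_tile_rows;         /* 2 */
      printf("%d tiles\n", tile_cols * tile_rows);       /* 8 */
      return 0;
    }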
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + const TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; + tile_data = pbi->tile_data + tile_cols * tile_row + tile_col; + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + init_macroblockd(cm, &tile_data->xd); + vp9_zero(tile_data->xd.dqcoeff); + } + } + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + TileInfo tile; + vp9_tile_set_row(&tile, cm, tile_row); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int col = pbi->inv_tile_order ? + tile_cols - tile_col - 1 : tile_col; + tile_data = pbi->tile_data + tile_cols * tile_row + col; + vp9_tile_set_col(&tile, tile_data->cm, col); + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col, + &tile_data->bit_reader, BLOCK_64X64); + } + pbi->mb.corrupted |= tile_data->xd.corrupted; + } + // Loopfilter one row. + if (cm->lf.filter_level && !pbi->mb.corrupted) { + const int lf_start = mi_row - MI_BLOCK_SIZE; + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + + // delay the loopfilter by 1 macroblock row. + if (lf_start < 0) continue; + + // decoding has completed: finish up the loop filter in this thread. + if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue; + + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + if (pbi->max_threads > 1) { + winterface->launch(&pbi->lf_worker); + } else { + winterface->execute(&pbi->lf_worker); + } + } + } + } + + // Loopfilter remaining rows in the frame. + if (cm->lf.filter_level && !pbi->mb.corrupted) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + winterface->sync(&pbi->lf_worker); + lf_data->start = lf_data->stop; + lf_data->stop = cm->mi_rows; + winterface->execute(&pbi->lf_worker); + } + + // Get last tile data. 
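Before the final tile lookup below, note the filtering schedule in the row loop above: the loop filter runs one superblock row behind decoding, so it never touches pixels that are still being reconstructed. A schematic of that pipeline (MI_BLOCK_SIZE is 8 mode-info units, one 64-pixel superblock row; the frame height is an arbitrary example):

    #include <stdio.h>

    #define MI_BLOCK_SIZE 8

    int main(void) {
      const int mi_rows = 32;
      int mi_row, lf_start;
      for (mi_row = 0; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) {
        printf("decode mi rows %2d..%2d\n", mi_row, mi_row + MI_BLOCK_SIZE - 1);
        lf_start = mi_row - MI_BLOCK_SIZE;
        if (lf_start < 0) continue;                       /* nothing ready yet */
        if (mi_row + MI_BLOCK_SIZE >= mi_rows) continue;  /* tail done below */
        printf("filter mi rows %2d..%2d\n", lf_start, mi_row - 1);
      }
      printf("filter remaining mi rows\n");
      return 0;
    }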
+ tile_data = pbi->tile_data + tile_cols * tile_rows - 1; + + return vp9_reader_find_end(&tile_data->bit_reader); +} + +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData*)arg1; + const TileInfo *const tile = (TileInfo*)arg2; + int mi_row, mi_col; + + for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(tile_data->cm, &tile_data->xd, tile, + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); + } + } + return !tile_data->xd.corrupted; +} + +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileBuffer *const buf1 = (const TileBuffer*)a; + const TileBuffer *const buf2 = (const TileBuffer*)b; + if (buf1->size < buf2->size) { + return 1; + } else if (buf1->size == buf2->size) { + return 0; + } else { + return -1; + } +} + +static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const uint8_t *bit_reader_end = NULL; + const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = MIN(pbi->max_threads & ~1, tile_cols); + TileBuffer tile_buffers[1][1 << 6]; + int n; + int final_worker = -1; + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + // TODO(jzern): See if we can remove the restriction of passing in max + // threads to the decoder. + if (pbi->num_tile_workers == 0) { + const int num_threads = pbi->max_threads & ~1; + int i; + // TODO(jzern): Allocate one less worker, as in the current code we only + // use num_threads - 1 workers. + CHECK_MEM_ERROR(cm, pbi->tile_workers, + vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); + for (i = 0; i < num_threads; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + ++pbi->num_tile_workers; + + winterface->init(worker); + CHECK_MEM_ERROR(cm, worker->data1, + vpx_memalign(32, sizeof(TileWorkerData))); + CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); + if (i < num_threads - 1 && !winterface->reset(worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + } + } + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + winterface->sync(&pbi->tile_workers[n]); + pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook; + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + vpx_memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + vpx_memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); + + // Sort the buffers based on size in descending order. + qsort(tile_buffers[0], tile_cols, sizeof(tile_buffers[0][0]), + compare_tile_buffers); + + // Rearrange the tile buffers such that per-tile group the largest, and + // presumably the most difficult, tile will be decoded in the main thread. 
+ // This should help minimize the number of instances where the main thread is + // waiting for a worker to complete. + { + int group_start = 0; + while (group_start < tile_cols) { + const TileBuffer largest = tile_buffers[0][group_start]; + const int group_end = MIN(group_start + num_workers, tile_cols) - 1; + memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1, + (group_end - group_start) * sizeof(tile_buffers[0][0])); + tile_buffers[0][group_end] = largest; + group_start = group_end + 1; + } + } + + n = 0; + while (n < tile_cols) { + int i; + for (i = 0; i < num_workers && n < tile_cols; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + TileInfo *const tile = (TileInfo*)worker->data2; + TileBuffer *const buf = &tile_buffers[0][n]; + + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(tile, tile_data->cm, 0, buf->col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + init_macroblockd(cm, &tile_data->xd); + vp9_zero(tile_data->xd.dqcoeff); + + worker->had_error = 0; + if (i == num_workers - 1 || n == tile_cols - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + + if (buf->col == tile_cols - 1) { + final_worker = i; + } + + ++n; + } + + for (; i > 0; --i) { + VP9Worker *const worker = &pbi->tile_workers[i - 1]; + pbi->mb.corrupted |= !winterface->sync(worker); + } + if (final_worker > -1) { + TileWorkerData *const tile_data = + (TileWorkerData*)pbi->tile_workers[final_worker].data1; + bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader); + final_worker = -1; + } + } + + return bit_reader_end; +} + +static void error_handler(void *data) { + VP9_COMMON *const cm = (VP9_COMMON *)data; + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb) { + return vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_0 && + vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_1 && + vp9_rb_read_literal(rb, 8) == VP9_SYNC_CODE_2; +} + +BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb) { + int profile = vp9_rb_read_bit(rb); + profile |= vp9_rb_read_bit(rb) << 1; + if (profile > 2) + profile += vp9_rb_read_bit(rb); + return (BITSTREAM_PROFILE) profile; +} + +static void read_bitdepth_colorspace_sampling( + VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { + if (cm->profile >= PROFILE_2) + cm->bit_depth = vp9_rb_read_bit(rb) ? VPX_BITS_12 : VPX_BITS_10; + cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3); + if (cm->color_space != SRGB) { + vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + cm->subsampling_x = vp9_rb_read_bit(rb); + cm->subsampling_y = vp9_rb_read_bit(rb); + if (cm->subsampling_x == 1 && cm->subsampling_y == 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:2:0 color not supported in profile 1 or 3"); + if (vp9_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + cm->subsampling_y = cm->subsampling_x = 1; + } + } else { + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed. 
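The PROFILE_1/PROFILE_3 branches here rest on vp9_read_profile() above, which packs the profile into two bits plus one extension bit once the value exceeds 2. A toy bit source makes that layout concrete (the array stands in for the real bit reader):

    #include <stdio.h>

    static const int bits[] = { 1, 1, 0 };  /* example profile bits */
    static int pos;
    static int read_bit(void) { return bits[pos++]; }

    int main(void) {
      int profile = read_bit();
      profile |= read_bit() << 1;  /* low two bits: 3 */
      if (profile > 2)
        profile += read_bit();     /* extension bit keeps it at 3 */
      printf("profile %d\n", profile);
      return 0;
    }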
+ cm->subsampling_y = cm->subsampling_x = 0; + if (vp9_rb_read_bit(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reserved bit set"); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "4:4:4 color not supported in profile 0 or 2"); + } + } +} + +static size_t read_uncompressed_header(VP9Decoder *pbi, + struct vp9_read_bit_buffer *rb) { + VP9_COMMON *const cm = &pbi->common; + size_t sz; + int i; + + cm->last_frame_type = cm->frame_type; + + if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame marker"); + + cm->profile = vp9_read_profile(rb); + + if (cm->profile >= MAX_PROFILES) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Unsupported bitstream profile"); + + cm->show_existing_frame = vp9_rb_read_bit(rb); + if (cm->show_existing_frame) { + // Show an existing frame directly. + const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; + + if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Buffer %d does not contain a decoded frame", + frame_to_show); + + ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show); + pbi->refresh_frame_flags = 0; + cm->lf.filter_level = 0; + cm->show_frame = 1; + return 0; + } + + cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb); + cm->show_frame = vp9_rb_read_bit(rb); + cm->error_resilient_mode = vp9_rb_read_bit(rb); + + if (cm->frame_type == KEY_FRAME) { + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + + read_bitdepth_colorspace_sampling(cm, rb); + pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (i = 0; i < REFS_PER_FRAME; ++i) { + cm->frame_refs[i].idx = -1; + cm->frame_refs[i].buf = NULL; + } + + setup_frame_size(cm, rb); + } else { + cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb); + + cm->reset_frame_context = cm->error_resilient_mode ? + 0 : vp9_rb_read_literal(rb, 2); + + if (cm->intra_only) { + if (!vp9_read_sync_code(rb)) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid frame sync code"); + if (cm->profile > PROFILE_0) { + read_bitdepth_colorspace_sampling(cm, rb); + } else { + // NOTE: The intra-only frame header does not include the specification + // of either the color format or color sub-sampling in profile 0. VP9 + // specifies that the default color space should be YUV 4:2:0 in this + // case (normative). 
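Both the key-frame and intra-only paths above gate on vp9_read_sync_code() before reading any size or colorspace fields. A standalone check of those three bytes (0x49 0x83 0x42, per the VP9 bitstream definition; the sample buffers are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* VP9_SYNC_CODE_0/1/2 from the bitstream definition. */
    static int has_vp9_sync_code(const uint8_t *p) {
      return p[0] == 0x49 && p[1] == 0x83 && p[2] == 0x42;
    }

    int main(void) {
      const uint8_t good[3] = { 0x49, 0x83, 0x42 };
      const uint8_t bad[3] = { 0x49, 0x83, 0x00 };
      printf("%d %d\n", has_vp9_sync_code(good), has_vp9_sync_code(bad));  /* 1 0 */
      return 0;
    }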
+ cm->color_space = BT_601; + cm->subsampling_y = cm->subsampling_x = 1; + } + + pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); + setup_frame_size(cm, rb); + } else { + pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); + for (i = 0; i < REFS_PER_FRAME; ++i) { + const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2); + const int idx = cm->ref_frame_map[ref]; + RefBuffer *const ref_frame = &cm->frame_refs[i]; + ref_frame->idx = idx; + ref_frame->buf = &cm->frame_bufs[idx].buf; + cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb); + } + + setup_frame_size_with_refs(cm, rb); + + cm->allow_high_precision_mv = vp9_rb_read_bit(rb); + cm->interp_filter = read_interp_filter(rb); + + for (i = 0; i < REFS_PER_FRAME; ++i) { + RefBuffer *const ref_buf = &cm->frame_refs[i]; + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + ref_buf->buf->y_crop_width, + ref_buf->buf->y_crop_height, + cm->width, cm->height); + if (vp9_is_scaled(&ref_buf->sf)) + vp9_extend_frame_borders(ref_buf->buf); + } + } + } + + if (!cm->error_resilient_mode) { + cm->refresh_frame_context = vp9_rb_read_bit(rb); + cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); + } else { + cm->refresh_frame_context = 0; + cm->frame_parallel_decoding_mode = 1; + } + + // This flag will be overridden by the call to vp9_setup_past_independence + // below, forcing the use of context 0 for those frame types. + cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); + + if (frame_is_intra_only(cm) || cm->error_resilient_mode) + vp9_setup_past_independence(cm); + + setup_loopfilter(&cm->lf, rb); + setup_quantization(cm, &pbi->mb, rb); + setup_segmentation(&cm->seg, rb); + + setup_tile_info(cm, rb); + sz = vp9_rb_read_literal(rb, 16); + + if (sz == 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid header size"); + + return sz; +} + +static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, + size_t partition_size) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + FRAME_CONTEXT *const fc = &cm->fc; + vp9_reader r; + int k; + + if (vp9_reader_init(&r, data, partition_size, pbi->decrypt_cb, + pbi->decrypt_state)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate bool decoder 0"); + + cm->tx_mode = xd->lossless ? 
ONLY_4X4 : read_tx_mode(&r); + if (cm->tx_mode == TX_MODE_SELECT) + read_tx_mode_probs(&fc->tx_probs, &r); + read_coef_probs(fc, cm->tx_mode, &r); + + for (k = 0; k < SKIP_CONTEXTS; ++k) + vp9_diff_update_prob(&r, &fc->skip_probs[k]); + + if (!frame_is_intra_only(cm)) { + nmv_context *const nmvc = &fc->nmvc; + int i, j; + + read_inter_mode_probs(fc, &r); + + if (cm->interp_filter == SWITCHABLE) + read_switchable_interp_probs(fc, &r); + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); + + cm->reference_mode = read_frame_reference_mode(cm, &r); + if (cm->reference_mode != SINGLE_REFERENCE) + setup_compound_reference_mode(cm); + read_frame_reference_mode_probs(cm, &r); + + for (j = 0; j < BLOCK_SIZE_GROUPS; j++) + for (i = 0; i < INTRA_MODES - 1; ++i) + vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]); + + for (j = 0; j < PARTITION_CONTEXTS; ++j) + for (i = 0; i < PARTITION_TYPES - 1; ++i) + vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); + + read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); + } + + return vp9_reader_has_error(&r); +} + +void vp9_init_dequantizer(VP9_COMMON *cm) { + int q; + + for (q = 0; q < QINDEX_RANGE; q++) { + cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q); + cm->y_dequant[q][1] = vp9_ac_quant(q, 0); + + cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q); + cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q); + } +} + +#ifdef NDEBUG +#define debug_check_frame_counts(cm) (void)0 +#else // !NDEBUG +// Counts should only be incremented when frame_parallel_decoding_mode and +// error_resilient_mode are disabled. +static void debug_check_frame_counts(const VP9_COMMON *const cm) { + FRAME_COUNTS zero_counts; + vp9_zero(zero_counts); + assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode); + assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode, + sizeof(cm->counts.y_mode))); + assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode, + sizeof(cm->counts.uv_mode))); + assert(!memcmp(cm->counts.partition, zero_counts.partition, + sizeof(cm->counts.partition))); + assert(!memcmp(cm->counts.coef, zero_counts.coef, + sizeof(cm->counts.coef))); + assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch, + sizeof(cm->counts.eob_branch))); + assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp, + sizeof(cm->counts.switchable_interp))); + assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode, + sizeof(cm->counts.inter_mode))); + assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter, + sizeof(cm->counts.intra_inter))); + assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter, + sizeof(cm->counts.comp_inter))); + assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref, + sizeof(cm->counts.single_ref))); + assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref, + sizeof(cm->counts.comp_ref))); + assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx))); + assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip))); + assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv))); +} +#endif // NDEBUG + +static struct vp9_read_bit_buffer* init_read_bit_buffer( + VP9Decoder *pbi, + struct vp9_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + if (pbi->decrypt_cb) { + const int n = (int)MIN(MAX_VP9_HEADER_SIZE, data_end - 
data); + pbi->decrypt_cb(pbi->decrypt_state, data, clear_data, n); + rb->bit_buffer = clear_data; + rb->bit_buffer_end = clear_data + n; + } else { + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + } + return rb; +} + +void vp9_decode_frame(VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0}; + + uint8_t clear_data[MAX_VP9_HEADER_SIZE]; + const size_t first_partition_size = read_uncompressed_header(pbi, + init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); + const int tile_rows = 1 << cm->log2_tile_rows; + const int tile_cols = 1 << cm->log2_tile_cols; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + xd->cur_buf = new_fb; + + if (!first_partition_size) { + // showing a frame directly + *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2); + return; + } + + data += vp9_rb_bytes_read(&rb); + if (!read_is_valid(data, first_partition_size, data_end)) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt header length"); + + init_macroblockd(cm, &pbi->mb); + + if (!cm->error_resilient_mode) + set_prev_mi(cm); + else + cm->prev_mi = NULL; + + setup_plane_dequants(cm, xd, cm->base_qindex); + vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y); + + cm->fc = cm->frame_contexts[cm->frame_context_idx]; + vp9_zero(cm->counts); + vp9_zero(xd->dqcoeff); + + xd->corrupted = 0; + new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); + + // TODO(jzern): remove frame_parallel_decoding_mode restriction for + // single-frame tile decoding. + if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 && + cm->frame_parallel_decoding_mode) { + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!xd->corrupted) { + // If multiple threads are used to decode tiles, then we use those threads + // to do parallel loopfiltering. + vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); + } + } else { + *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); + } + + new_fb->corrupted |= xd->corrupted; + + if (!new_fb->corrupted) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_coef_probs(cm); + + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } else { + debug_check_frame_counts(cm); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + + if (cm->refresh_frame_context) + cm->frame_contexts[cm->frame_context_idx] = cm->fc; +} diff --git a/media/libvpx/vp9/decoder/vp9_decodeframe.h b/media/libvpx/vp9/decoder/vp9_decodeframe.h new file mode 100644 index 000000000000..10a9e34629b4 --- /dev/null +++ b/media/libvpx/vp9/decoder/vp9_decodeframe.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_DECODER_VP9_DECODEFRAME_H_ +#define VP9_DECODER_VP9_DECODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9Common; +struct VP9Decoder; +struct vp9_read_bit_buffer; + +void vp9_init_dequantizer(struct VP9Common *cm); + +void vp9_decode_frame(struct VP9Decoder *pbi, + const uint8_t *data, const uint8_t *data_end, + const uint8_t **p_data_end); + +int vp9_read_sync_code(struct vp9_read_bit_buffer *const rb); +void vp9_read_frame_size(struct vp9_read_bit_buffer *rb, + int *width, int *height); +BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/media/libvpx/vp9/decoder/vp9_decodemv.c b/media/libvpx/vp9/decoder/vp9_decodemv.c index b948429b3a41..32e80f93bbe4 100644 --- a/media/libvpx/vp9/decoder/vp9_decodemv.c +++ b/media/libvpx/vp9/decoder/vp9_decodemv.c @@ -14,43 +14,40 @@ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" -#include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/decoder/vp9_decodemv.h" -#include "vp9/decoder/vp9_decodframe.h" -#include "vp9/decoder/vp9_onyxd_int.h" -#include "vp9/decoder/vp9_treereader.h" +#include "vp9/decoder/vp9_decodeframe.h" +#include "vp9/decoder/vp9_reader.h" -static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { - return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p); +static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { + return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p); } -static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, +static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r, int size_group) { - const MB_PREDICTION_MODE y_mode = read_intra_mode(r, - cm->fc.y_mode_prob[size_group]); + const PREDICTION_MODE y_mode = + read_intra_mode(r, cm->fc.y_mode_prob[size_group]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.y_mode[size_group][y_mode]; return y_mode; } -static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, - MB_PREDICTION_MODE y_mode) { - const MB_PREDICTION_MODE uv_mode = read_intra_mode(r, +static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r, + PREDICTION_MODE y_mode) { + const PREDICTION_MODE uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[y_mode]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.uv_mode[y_mode][uv_mode]; return uv_mode; } -static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, - int ctx) { - const int mode = treed_read(r, vp9_inter_mode_tree, - cm->fc.inter_mode_probs[ctx]); +static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) { + const int mode = vp9_read_tree(r, vp9_inter_mode_tree, + cm->fc.inter_mode_probs[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.inter_mode[ctx][mode]; @@ -58,14 +55,14 @@ static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, } static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { - return treed_read(r, vp9_segment_tree, seg->tree_probs); + return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs); } static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_SIZE max_tx_size, vp9_reader *r) { - const int ctx = vp9_get_pred_context_tx_size(xd); + const 
int ctx = vp9_get_tx_size_context(xd); const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs); - TX_SIZE tx_size = vp9_read(r, tx_probs[0]); + int tx_size = vp9_read(r, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { tx_size += vp9_read(r, tx_probs[1]); if (tx_size != TX_8X8 && max_tx_size >= TX_32X32) @@ -74,7 +71,7 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, if (!cm->frame_parallel_decoding_mode) ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size]; - return tx_size; + return (TX_SIZE)tx_size; } static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode, @@ -106,7 +103,7 @@ static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, int mi_row, int mi_col, vp9_reader *r) { struct segmentation *const seg = &cm->seg; - const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; int segment_id; if (!seg->enabled) @@ -123,23 +120,23 @@ static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, int mi_row, int mi_col, vp9_reader *r) { struct segmentation *const seg = &cm->seg; - const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; - int pred_segment_id, segment_id; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; + int predicted_segment_id, segment_id; if (!seg->enabled) return 0; // Default for disabled segmentation - pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, - bsize, mi_row, mi_col); + predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, + bsize, mi_row, mi_col); if (!seg->update_map) - return pred_segment_id; + return predicted_segment_id; if (seg->temporal_update) { const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); - const int pred_flag = vp9_read(r, pred_prob); - vp9_set_pred_flag_seg_id(xd, pred_flag); - segment_id = pred_flag ? pred_segment_id - : read_segment_id(r, seg); + mbmi->seg_id_predicted = vp9_read(r, pred_prob); + segment_id = mbmi->seg_id_predicted ? predicted_segment_id + : read_segment_id(r, seg); } else { segment_id = read_segment_id(r, seg); } @@ -147,60 +144,57 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd, return segment_id; } -static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd, - int segment_id, vp9_reader *r) { +static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, vp9_reader *r) { if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int ctx = vp9_get_pred_context_mbskip(xd); - const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]); + const int ctx = vp9_get_skip_context(xd); + const int skip = vp9_read(r, cm->fc.skip_probs[ctx]); if (!cm->frame_parallel_decoding_mode) - ++cm->counts.mbskip[ctx][skip]; + ++cm->counts.skip[ctx][skip]; return skip; } } static void read_intra_frame_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, - MODE_INFO *const m, int mi_row, int mi_col, vp9_reader *r) { - MB_MODE_INFO *const mbmi = &m->mbmi; + MODE_INFO *const mi = xd->mi[0]; + MB_MODE_INFO *const mbmi = &mi->mbmi; + const MODE_INFO *above_mi = xd->mi[-cm->mi_stride]; + const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL; const BLOCK_SIZE bsize = mbmi->sb_type; - const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride]; - const MODE_INFO *left_mi = xd->left_available ? 
xd->mi_8x8[-1] : NULL; + int i; mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; - if (bsize >= BLOCK_8X8) { - const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); - const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0); - mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); - } else { - // Only 4x4, 4x8, 8x4 blocks - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 - int idx, idy; - - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int ib = idy * 2 + idx; - const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib); - const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, ib); - const MB_PREDICTION_MODE b_mode = read_intra_mode(r, - vp9_kf_y_mode_prob[A][L]); - m->bmi[ib].as_mode = b_mode; - if (num_4x4_h == 2) - m->bmi[ib + 2].as_mode = b_mode; - if (num_4x4_w == 2) - m->bmi[ib + 1].as_mode = b_mode; - } - } - - mbmi->mode = m->bmi[3].as_mode; + switch (bsize) { + case BLOCK_4X4: + for (i = 0; i < 4; ++i) + mi->bmi[i].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, i)); + mbmi->mode = mi->bmi[3].as_mode; + break; + case BLOCK_4X8: + mi->bmi[0].as_mode = mi->bmi[2].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 1)); + break; + case BLOCK_8X4: + mi->bmi[0].as_mode = mi->bmi[1].as_mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 0)); + mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode = + read_intra_mode(r, get_y_mode_probs(mi, above_mi, left_mi, 2)); + break; + default: + mbmi->mode = read_intra_mode(r, + get_y_mode_probs(mi, above_mi, left_mi, 0)); } mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]); @@ -210,12 +204,12 @@ static int read_mv_component(vp9_reader *r, const nmv_component *mvcomp, int usehp) { int mag, d, fr, hp; const int sign = vp9_read(r, mvcomp->sign); - const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes); + const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes); const int class0 = mv_class == MV_CLASS_0; // Integer part if (class0) { - d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0); + d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0); } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits @@ -226,8 +220,8 @@ static int read_mv_component(vp9_reader *r, } // Fractional part - fr = treed_read(r, vp9_mv_fp_tree, - class0 ? mvcomp->class0_fp[d] : mvcomp->fp); + fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? 
mvcomp->class0_fp[d] + : mvcomp->fp); // High precision part (if hp is not used, the default value of the hp is 1) @@ -242,14 +236,15 @@ static int read_mv_component(vp9_reader *r, static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, const nmv_context *ctx, nmv_context_counts *counts, int allow_hp) { - const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints); const int use_hp = allow_hp && vp9_use_mv_hp(ref); MV diff = {0, 0}; - if (mv_joint_vertical(j)) + if (mv_joint_vertical(joint_type)) diff.row = read_mv_component(r, &ctx->comps[0], use_hp); - if (mv_joint_horizontal(j)) + if (mv_joint_horizontal(joint_type)) diff.col = read_mv_component(r, &ctx->comps[1], use_hp); vp9_inc_mv(&diff, counts); @@ -258,14 +253,19 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, mv->col = ref->col + diff.col; } -static COMPPREDMODE_TYPE read_reference_mode(VP9_COMMON *cm, - const MACROBLOCKD *xd, - vp9_reader *r) { - const int ctx = vp9_get_pred_context_comp_inter_inter(cm, xd); - const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]); - if (!cm->frame_parallel_decoding_mode) - ++cm->counts.comp_inter[ctx][mode]; - return mode; // SINGLE_PREDICTION_ONLY or COMP_PREDICTION_ONLY +static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, + const MACROBLOCKD *xd, + vp9_reader *r) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = vp9_get_reference_mode_context(cm, xd); + const REFERENCE_MODE mode = + (REFERENCE_MODE)vp9_read(r, cm->fc.comp_inter_prob[ctx]); + if (!cm->frame_parallel_decoding_mode) + ++cm->counts.comp_inter[ctx][mode]; + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + return cm->reference_mode; + } } // Read the referncence frame @@ -276,15 +276,13 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, FRAME_COUNTS *const counts = &cm->counts; if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { - ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); ref_frame[1] = NONE; } else { - const COMPPREDMODE_TYPE mode = (cm->comp_pred_mode == HYBRID_PREDICTION) - ? 
read_reference_mode(cm, xd, r) - : cm->comp_pred_mode; - + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding - if (mode == COMP_PREDICTION_ONLY) { + if (mode == COMPOUND_REFERENCE) { const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); const int bit = vp9_read(r, fc->comp_ref_prob[ctx]); @@ -292,7 +290,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ++counts->comp_ref[ctx][bit]; ref_frame[idx] = cm->comp_fixed_ref; ref_frame[!idx] = cm->comp_var_ref[bit]; - } else if (mode == SINGLE_PREDICTION_ONLY) { + } else if (mode == SINGLE_REFERENCE) { const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]); if (!cm->frame_parallel_decoding_mode) @@ -309,17 +307,18 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[1] = NONE; } else { - assert(!"Invalid prediction mode."); + assert(0 && "Invalid prediction mode."); } } } -static INLINE INTERPOLATION_TYPE read_switchable_filter_type( +static INLINE INTERP_FILTER read_switchable_interp_filter( VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) { const int ctx = vp9_get_pred_context_switchable_interp(xd); - const int type = treed_read(r, vp9_switchable_interp_tree, - cm->fc.switchable_interp_prob[ctx]); + const INTERP_FILTER type = + (INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree, + cm->fc.switchable_interp_prob[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.switchable_interp[ctx][type]; return type; @@ -329,39 +328,43 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi, vp9_reader *r) { MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE bsize = mi->mbmi.sb_type; + int i; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->ref_frame[1] = NONE; - if (bsize >= BLOCK_8X8) { - mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]); - } else { - // Only 4x4, 4x8, 8x4 blocks - const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 - const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 - int idx, idy; - - for (idy = 0; idy < 2; idy += num_4x4_h) { - for (idx = 0; idx < 2; idx += num_4x4_w) { - const int ib = idy * 2 + idx; - const int b_mode = read_intra_mode_y(cm, r, 0); - mi->bmi[ib].as_mode = b_mode; - if (num_4x4_h == 2) - mi->bmi[ib + 2].as_mode = b_mode; - if (num_4x4_w == 2) - mi->bmi[ib + 1].as_mode = b_mode; - } - } - mbmi->mode = mi->bmi[3].as_mode; + switch (bsize) { + case BLOCK_4X4: + for (i = 0; i < 4; ++i) + mi->bmi[i].as_mode = read_intra_mode_y(cm, r, 0); + mbmi->mode = mi->bmi[3].as_mode; + break; + case BLOCK_4X8: + mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, r, 0); + mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode = + read_intra_mode_y(cm, r, 0); + break; + case BLOCK_8X4: + mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, r, 0); + mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode = + read_intra_mode_y(cm, r, 0); + break; + default: + mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]); } mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode); } -static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, - int_mv mv[2], int_mv best_mv[2], - int_mv nearest_mv[2], int_mv near_mv[2], - int is_compound, int allow_hp, vp9_reader *r) { +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < 
MV_UPP && + mv->col > MV_LOW && mv->col < MV_UPP; +} + +static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode, + int_mv mv[2], int_mv ref_mv[2], + int_mv nearest_mv[2], int_mv near_mv[2], + int is_compound, int allow_hp, vp9_reader *r) { int i; int ret = 1; @@ -369,30 +372,29 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, case NEWMV: { nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? NULL : &cm->counts.mv; - read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv, - &cm->fc.nmvc, mv_counts, allow_hp); - if (is_compound) - read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv, - &cm->fc.nmvc, mv_counts, allow_hp); for (i = 0; i < 1 + is_compound; ++i) { - ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW; - ret = ret && mv[i].as_mv.col < MV_UPP && mv[i].as_mv.col > MV_LOW; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts, + allow_hp); + ret = ret && is_mv_valid(&mv[i].as_mv); } break; } case NEARESTMV: { mv[0].as_int = nearest_mv[0].as_int; - if (is_compound) mv[1].as_int = nearest_mv[1].as_int; + if (is_compound) + mv[1].as_int = nearest_mv[1].as_int; break; } case NEARMV: { mv[0].as_int = near_mv[0].as_int; - if (is_compound) mv[1].as_int = near_mv[1].as_int; + if (is_compound) + mv[1].as_int = near_mv[1].as_int; break; } case ZEROMV: { mv[0].as_int = 0; - if (is_compound) mv[1].as_int = 0; + if (is_compound) + mv[1].as_int = 0; break; } default: { @@ -408,8 +410,8 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd, return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; } else { - const int ctx = vp9_get_pred_context_intra_inter(xd); - const int is_inter = vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd)); + const int ctx = vp9_get_intra_inter_context(xd); + const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]); if (!cm->frame_parallel_decoding_mode) ++cm->counts.intra_inter[ctx][is_inter]; return is_inter; @@ -425,20 +427,24 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - int_mv nearest[2], nearmv[2], best[2]; - uint8_t inter_mode_ctx; - MV_REFERENCE_FRAME ref0; - int is_compound; + int_mv nearestmv[2], nearmv[2]; + int inter_mode_ctx, ref, is_compound; - mbmi->uv_mode = DC_PRED; read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); - ref0 = mbmi->ref_frame[0]; is_compound = has_second_ref(mbmi); - vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0], - mi_row, mi_col); + for (ref = 0; ref < 1 + is_compound; ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + const int ref_idx = frame - LAST_FRAME; + if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE ) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], + mi_row, mi_col); + } - inter_mode_ctx = mbmi->mode_context[ref0]; + inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; @@ -452,58 +458,42 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); } - // nearest, nearby if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, allow_hp, - mbmi->ref_mvs[ref0], &nearest[0], &nearmv[0]); - 
best[0].as_int = nearest[0].as_int; - } - - if (is_compound) { - const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; - vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, - ref1, mbmi->ref_mvs[ref1], mi_row, mi_col); - - if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, allow_hp, - mbmi->ref_mvs[ref1], &nearest[1], &nearmv[1]); - best[1].as_int = nearest[1].as_int; + for (ref = 0; ref < 1 + is_compound; ++ref) { + vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]], + &nearestmv[ref], &nearmv[ref]); } } - mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE) - ? read_switchable_filter_type(cm, xd, r) - : cm->mcomp_filter_type; + mbmi->interp_filter = (cm->interp_filter == SWITCHABLE) + ? read_switchable_interp_filter(cm, xd, r) + : cm->interp_filter; if (bsize < BLOCK_8X8) { const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; - int b_mode; + PREDICTION_MODE b_mode; + int_mv nearest_sub8x8[2], near_sub8x8[2]; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv block[2]; const int j = idy * 2 + idx; b_mode = read_inter_mode(cm, r, inter_mode_ctx); - if (b_mode == NEARESTMV || b_mode == NEARMV) { - vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0], - &nearmv[0], j, 0, - mi_row, mi_col); + if (b_mode == NEARESTMV || b_mode == NEARMV) + for (ref = 0; ref < 1 + is_compound; ++ref) + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col, + &nearest_sub8x8[ref], + &near_sub8x8[ref]); - if (is_compound) - vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1], - &nearmv[1], j, 1, - mi_row, mi_col); - } - - if (!assign_mv(cm, b_mode, block, best, nearest, nearmv, + if (!assign_mv(cm, b_mode, block, nearestmv, + nearest_sub8x8, near_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; break; }; - mi->bmi[j].as_mv[0].as_int = block[0].as_int; if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int; @@ -520,27 +510,26 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } else { - xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, - best, nearest, nearmv, - is_compound, allow_hp, r); + xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv, + nearestmv, nearmv, is_compound, allow_hp, r); } } static void read_inter_frame_mode_info(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, - MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { + MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO *const mbmi = &mi->mbmi; int inter_block; mbmi->mv[0].as_int = 0; mbmi->mv[1].as_int = 0; mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r); - mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r); + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type, - !mbmi->skip_coeff || !inter_block, r); + !mbmi->skip || !inter_block, r); if (inter_block) read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r); @@ -551,22 +540,8 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm, void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { - MODE_INFO *const mi = xd->mi_8x8[0]; - const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int bw = 
num_8x8_blocks_wide_lookup[bsize];
-  const int bh = num_8x8_blocks_high_lookup[bsize];
-  const int y_mis = MIN(bh, cm->mi_rows - mi_row);
-  const int x_mis = MIN(bw, cm->mi_cols - mi_col);
-  int x, y, z;
-
   if (frame_is_intra_only(cm))
-    read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r);
+    read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
   else
-    read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
-
-  for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) {
-    for (x = !y; x < x_mis; x++) {
-      xd->mi_8x8[z + x] = mi;
-    }
-  }
+    read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
 }
diff --git a/media/libvpx/vp9/decoder/vp9_decodemv.h b/media/libvpx/vp9/decoder/vp9_decodemv.h
index 8e9ae4a548e7..7394b62b4517 100644
--- a/media/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/media/libvpx/vp9/decoder/vp9_decodemv.h
@@ -11,8 +11,11 @@
 #ifndef VP9_DECODER_VP9_DECODEMV_H_
 #define VP9_DECODER_VP9_DECODEMV_H_
 
-#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
+#include "vp9/decoder/vp9_reader.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 struct TileInfo;
 
@@ -20,4 +23,8 @@ void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
                         const struct TileInfo *const tile,
                         int mi_row, int mi_col, vp9_reader *r);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif  // VP9_DECODER_VP9_DECODEMV_H_
diff --git a/media/libvpx/vp9/decoder/vp9_decoder.c b/media/libvpx/vp9/decoder/vp9_decoder.c
new file mode 100644
index 000000000000..9106b0d14b9a
--- /dev/null
+++ b/media/libvpx/vp9/decoder/vp9_decoder.c
@@ -0,0 +1,380 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/decoder/vp9_decodeframe.h"
+#include "vp9/decoder/vp9_decoder.h"
+#include "vp9/decoder/vp9_detokenize.h"
+#include "vp9/decoder/vp9_dthread.h"
+
+static void initialize_dec() {
+  static int init_done = 0;
+
+  if (!init_done) {
+    vp9_rtcd();
+    vp9_init_neighbors();
+    vp9_init_intra_predictors();
+    init_done = 1;
+  }
+}
+
+VP9Decoder *vp9_decoder_create() {
+  VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
+  VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
+
+  if (!cm)
+    return NULL;
+
+  vp9_zero(*pbi);
+
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    vp9_decoder_remove(pbi);
+    return NULL;
+  }
+
+  cm->error.setjmp = 1;
+  initialize_dec();
+
+  // Initialize the references to not point to any frame buffers.
+  vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+
+  cm->current_video_frame = 0;
+  pbi->ready_for_new_data = 1;
+  cm->bit_depth = VPX_BITS_8;
+
+  // vp9_init_dequantizer() is first called here. 
Add check in + // frame_init_dequantizer() to avoid unnecessary calling of + // vp9_init_dequantizer() for every frame. + vp9_init_dequantizer(cm); + + vp9_loop_filter_init(cm); + + cm->error.setjmp = 0; + + vp9_get_worker_interface()->init(&pbi->lf_worker); + + return pbi; +} + +void vp9_decoder_remove(VP9Decoder *pbi) { + VP9_COMMON *const cm = &pbi->common; + int i; + + vp9_get_worker_interface()->end(&pbi->lf_worker); + vpx_free(pbi->lf_worker.data1); + vpx_free(pbi->tile_data); + for (i = 0; i < pbi->num_tile_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + vp9_get_worker_interface()->end(worker); + vpx_free(worker->data1); + vpx_free(worker->data2); + } + vpx_free(pbi->tile_workers); + + if (pbi->num_tile_workers > 0) { + vp9_loop_filter_dealloc(&pbi->lf_row_sync); + } + + vp9_remove_common(cm); + vpx_free(pbi); +} + +static int equal_dimensions(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width; +} + +vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + VP9_COMMON *cm = &pbi->common; + + /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + * encoder is using the frame buffers for. This is just a stub to keep the + * vpxenc --test-decode functionality working, and will be replaced in a + * later commit that adds VP9-specific controls for this functionality. + */ + if (ref_frame_flag == VP9_LAST_FLAG) { + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0); + if (cfg == NULL) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "No 'last' reference frame"); + return VPX_CODEC_ERROR; + } + if (!equal_dimensions(cfg, sd)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + vp8_yv12_copy_frame(cfg, sd); + } else { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Invalid reference frame"); + } + + return cm->error.error_code; +} + + +vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + RefBuffer *ref_buf = NULL; + + // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the + // encoder is using the frame buffers for. This is just a stub to keep the + // vpxenc --test-decode functionality working, and will be replaced in a + // later commit that adds VP9-specific controls for this functionality. + if (ref_frame_flag == VP9_LAST_FLAG) { + ref_buf = &cm->frame_refs[0]; + } else if (ref_frame_flag == VP9_GOLD_FLAG) { + ref_buf = &cm->frame_refs[1]; + } else if (ref_frame_flag == VP9_ALT_FLAG) { + ref_buf = &cm->frame_refs[2]; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Invalid reference frame"); + return cm->error.error_code; + } + + if (!equal_dimensions(ref_buf->buf, sd)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + int *ref_fb_ptr = &ref_buf->idx; + + // Find an empty frame buffer. + const int free_fb = get_free_fb(cm); + // Decrease ref_count since it will be increased again in + // ref_cnt_fb() below. + cm->frame_bufs[free_fb].ref_count--; + + // Manage the reference counters and copy image. + ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb); + ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf; + vp8_yv12_copy_frame(sd, ref_buf->buf); + } + + return cm->error.error_code; +} + +/* If any buffer updating is signaled it should be done here. 
*/ +static void swap_frame_buffers(VP9Decoder *pbi) { + int ref_index = 0, mask; + VP9_COMMON *const cm = &pbi->common; + + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index], + cm->new_fb_idx); + if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0) + cm->release_fb_cb(cm->cb_priv, + &cm->frame_bufs[old_idx].raw_frame_buffer); + } + ++ref_index; + } + + cm->frame_to_show = get_frame_new_buffer(cm); + cm->frame_bufs[cm->new_fb_idx].ref_count--; + + // Invalidate these references until the next frame starts. + for (ref_index = 0; ref_index < 3; ref_index++) + cm->frame_refs[ref_index].idx = INT_MAX; +} + +int vp9_receive_compressed_data(VP9Decoder *pbi, + size_t size, const uint8_t **psource) { + VP9_COMMON *const cm = &pbi->common; + const uint8_t *source = *psource; + int retcode = 0; + + cm->error.error_code = VPX_CODEC_OK; + + if (size == 0) { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) was supposed to update + // any of the reference buffers, but we act conservative and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + if (cm->frame_refs[0].idx != INT_MAX) + cm->frame_refs[0].buf->corrupted = 1; + } + + // Check if the previous frame was a frame without any references to it. + if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0) + cm->release_fb_cb(cm->cb_priv, + &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer); + cm->new_fb_idx = get_free_fb(cm); + + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + vp9_clear_system_state(); + + // We do not know if the missing frame(s) was supposed to update + // any of the reference buffers, but we act conservative and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL) + cm->frame_refs[0].buf->corrupted = 1; + + if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0) + cm->frame_bufs[cm->new_fb_idx].ref_count--; + + return -1; + } + + cm->error.setjmp = 1; + + vp9_decode_frame(pbi, source, source + size, psource); + + swap_frame_buffers(pbi); + + vp9_clear_system_state(); + + cm->last_width = cm->width; + cm->last_height = cm->height; + + if (!cm->show_existing_frame) + cm->last_show_frame = cm->show_frame; + if (cm->show_frame) { + if (!cm->show_existing_frame) + vp9_swap_mi_and_prev_mi(cm); + + cm->current_video_frame++; + } + + pbi->ready_for_new_data = 0; + + cm->error.setjmp = 0; + return retcode; +} + +int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, + vp9_ppflags_t *flags) { + VP9_COMMON *const cm = &pbi->common; + int ret = -1; +#if !CONFIG_VP9_POSTPROC + (void)*flags; +#endif + + if (pbi->ready_for_new_data == 1) + return ret; + + /* no raw frame to show!!! 
*/ + if (!cm->show_frame) + return ret; + + pbi->ready_for_new_data = 1; + +#if CONFIG_VP9_POSTPROC + if (!cm->show_existing_frame) { + ret = vp9_post_proc_frame(cm, sd, flags); + } else { + *sd = *cm->frame_to_show; + ret = 0; + } +#else + *sd = *cm->frame_to_show; + ret = 0; +#endif /*!CONFIG_POSTPROC*/ + vp9_clear_system_state(); + return ret; +} + +vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. + + uint8_t marker; + + assert(data_sz); + marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1); + *count = 0; + + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) + return VPX_CODEC_CORRUPT_FRAME; + + { + const uint8_t marker2 = read_marker(decrypt_cb, decrypt_state, + data + data_sz - index_sz); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) + return VPX_CODEC_CORRUPT_FRAME; + } + + { + // Found a valid superframe index. + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + + // Frames has a maximum of 8 and mag has a maximum of 4. + uint8_t clear_buffer[32]; + assert(sizeof(clear_buffer) >= frames * mag); + if (decrypt_cb) { + decrypt_cb(decrypt_state, x, clear_buffer, frames * mag); + x = clear_buffer; + } + + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) + this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; +} diff --git a/media/libvpx/vp9/decoder/vp9_decoder.h b/media/libvpx/vp9/decoder/vp9_decoder.h new file mode 100644 index 000000000000..848d212e621f --- /dev/null +++ b/media/libvpx/vp9/decoder/vp9_decoder.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_DECODER_VP9_DECODER_H_ +#define VP9_DECODER_VP9_DECODER_H_ + +#include "./vpx_config.h" + +#include "vpx/vpx_codec.h" +#include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_ppflags.h" +#include "vp9/common/vp9_thread.h" + +#include "vp9/decoder/vp9_dthread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// TODO(hkuang): combine this with TileWorkerData. 
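An illustrative sketch (not from the libvpx tree; all names hypothetical) of the size arithmetic vp9_parse_superframe_index() applies above: for marker 0xc2, frames = (0xc2 & 0x7) + 1 = 3 and mag = ((0xc2 >> 3) & 0x3) + 1 = 1, so index_sz = 2 + 1 * 3 = 5 bytes, i.e. the marker, three one-byte little-endian frame sizes, and the marker repeated, which is why the parser requires the first and last bytes of the index to match.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper mirroring the index-size computation in
 * vp9_parse_superframe_index(). */
static size_t superframe_index_size(uint8_t marker) {
  const int frames = (marker & 0x7) + 1;      /* 1..8 frames per chunk */
  const int mag = ((marker >> 3) & 0x3) + 1;  /* 1..4 bytes per size entry */
  return 2 + (size_t)(mag * frames);          /* two markers + size table */
}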
+typedef struct TileData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} TileData; + +typedef struct VP9Decoder { + DECLARE_ALIGNED(16, MACROBLOCKD, mb); + + DECLARE_ALIGNED(16, VP9_COMMON, common); + + int ready_for_new_data; + + int refresh_frame_flags; + + int frame_parallel_decode; // frame-based threading. + + VP9Worker lf_worker; + VP9Worker *tile_workers; + int num_tile_workers; + + TileData *tile_data; + int total_tiles; + + VP9LfSync lf_row_sync; + + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; + + int max_threads; + int inv_tile_order; +} VP9Decoder; + +int vp9_receive_compressed_data(struct VP9Decoder *pbi, + size_t size, const uint8_t **dest); + +int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, + vp9_ppflags_t *flags); + +vpx_codec_err_t vp9_copy_reference_dec(struct VP9Decoder *pbi, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, + VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +struct VP9Decoder *vp9_decoder_create(); + +void vp9_decoder_remove(struct VP9Decoder *pbi); + +static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb, + void *decrypt_state, + const uint8_t *data) { + if (decrypt_cb) { + uint8_t marker; + decrypt_cb(decrypt_state, data, &marker, 1); + return marker; + } + return *data; +} + +// This function is exposed for use in tests, as well as the inlined function +// "read_marker". +vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data, + size_t data_sz, + uint32_t sizes[8], int *count, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_DECODER_H_ diff --git a/media/libvpx/vp9/decoder/vp9_decodframe.c b/media/libvpx/vp9/decoder/vp9_decodframe.c deleted file mode 100644 index d20593a11194..000000000000 --- a/media/libvpx/vp9/decoder/vp9_decodframe.c +++ /dev/null @@ -1,1389 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#include <assert.h>
-
-#include "./vp9_rtcd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpx_scale.h"
-
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_idct.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_tile_common.h"
-
-#include "vp9/decoder/vp9_dboolhuff.h"
-#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_decodemv.h"
-#include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_read_bit_buffer.h"
-#include "vp9/decoder/vp9_thread.h"
-#include "vp9/decoder/vp9_treereader.h"
-
-typedef struct TileWorkerData {
-  VP9_COMMON *cm;
-  vp9_reader bit_reader;
-  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
-  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
-  DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]);
-  DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
-  DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
-} TileWorkerData;
-
-static int read_be32(const uint8_t *p) {
-  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
-}
-
-static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
-  int i;
-  for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
-    if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
-      return 1;
-
-  return 0;
-}
-
-static void setup_compound_prediction(VP9_COMMON *cm) {
-  if (cm->ref_frame_sign_bias[LAST_FRAME] ==
-          cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
-    cm->comp_fixed_ref = ALTREF_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = GOLDEN_FRAME;
-  } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
-                 cm->ref_frame_sign_bias[ALTREF_FRAME]) {
-    cm->comp_fixed_ref = GOLDEN_FRAME;
-    cm->comp_var_ref[0] = LAST_FRAME;
-    cm->comp_var_ref[1] = ALTREF_FRAME;
-  } else {
-    cm->comp_fixed_ref = LAST_FRAME;
-    cm->comp_var_ref[0] = GOLDEN_FRAME;
-    cm->comp_var_ref[1] = ALTREF_FRAME;
-  }
-}
-
-// len == 0 is not allowed
-static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
-  return start + len > start && start + len <= end;
-}
-
-static int decode_unsigned_max(struct vp9_read_bit_buffer *rb, int max) {
-  const int data = vp9_rb_read_literal(rb, get_unsigned_bits(max));
-  return data > max ? 
max : data; -} - -static TX_MODE read_tx_mode(vp9_reader *r) { - TX_MODE tx_mode = vp9_read_literal(r, 2); - if (tx_mode == ALLOW_32X32) - tx_mode += vp9_read_bit(r); - return tx_mode; -} - -static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { - int i, j; - - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) - for (j = 0; j < TX_SIZES - 3; ++j) - vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); - - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) - for (j = 0; j < TX_SIZES - 2; ++j) - vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); - - for (i = 0; i < TX_SIZE_CONTEXTS; ++i) - for (j = 0; j < TX_SIZES - 1; ++j) - vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); -} - -static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { - int i, j; - for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) - for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) - vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]); -} - -static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { - int i, j; - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < INTER_MODES - 1; ++j) - vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]); -} - -static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { - COMPPREDMODE_TYPE mode = vp9_read_bit(r); - if (mode) - mode += vp9_read_bit(r); - return mode; -} - -static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { - int i; - - const int compound_allowed = is_compound_prediction_allowed(cm); - cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r) - : SINGLE_PREDICTION_ONLY; - if (compound_allowed) - setup_compound_prediction(cm); - - if (cm->comp_pred_mode == HYBRID_PREDICTION) - for (i = 0; i < COMP_INTER_CONTEXTS; i++) - vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]); - - if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) { - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]); - vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]); - } - - if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) - for (i = 0; i < REF_CONTEXTS; i++) - vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]); -} - -static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) { - int i; - for (i = 0; i < n; ++i) - if (vp9_read(r, NMV_UPDATE_PROB)) - p[i] = (vp9_read_literal(r, 7) << 1) | 1; -} - -static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) { - int i, j; - - update_mv_probs(ctx->joints, MV_JOINTS - 1, r); - - for (i = 0; i < 2; ++i) { - nmv_component *const comp_ctx = &ctx->comps[i]; - update_mv_probs(&comp_ctx->sign, 1, r); - update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r); - update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r); - update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r); - } - - for (i = 0; i < 2; ++i) { - nmv_component *const comp_ctx = &ctx->comps[i]; - for (j = 0; j < CLASS0_SIZE; ++j) - update_mv_probs(comp_ctx->class0_fp[j], 3, r); - update_mv_probs(comp_ctx->fp, 3, r); - } - - if (allow_hp) { - for (i = 0; i < 2; ++i) { - nmv_component *const comp_ctx = &ctx->comps[i]; - update_mv_probs(&comp_ctx->class0_hp, 1, r); - update_mv_probs(&comp_ctx->hp, 1, r); - } - } -} - -static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) { - int i; - xd->plane[0].dequant = cm->y_dequant[q_index]; - - for (i = 1; i < MAX_MB_PLANE; i++) - xd->plane[i].dequant = cm->uv_dequant[q_index]; -} - -// Allocate storage for each tile column. -// TODO(jzern): when max_threads <= 1 the same storage could be used for each -// tile. 
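As an aside on update_mv_probs() above (a sketch, not part of the patch): an updated probability is transported as a 7-bit literal v and reconstructed as (v << 1) | 1, so only odd values 1..255 are representable and an updated probability can never be zero, which the boolean arithmetic decoder cannot handle. Assuming vp9_prob is uint8_t, as in the libvpx headers:

#include <stdint.h>

typedef uint8_t vp9_prob;  /* assumption: matches the libvpx typedef */

/* Map a 7-bit literal v in [0, 127] to the odd probability 2v + 1. */
static vp9_prob mv_prob_from_literal(int v) {
  return (vp9_prob)((v << 1) | 1);
}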
-static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) { - VP9_COMMON *const cm = &pbi->common; - const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - int i, tile_row, tile_col; - - CHECK_MEM_ERROR(cm, pbi->mi_streams, - vpx_realloc(pbi->mi_streams, tile_rows * tile_cols * - sizeof(*pbi->mi_streams))); - for (tile_row = 0; tile_row < tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo tile; - vp9_tile_init(&tile, cm, tile_row, tile_col); - pbi->mi_streams[tile_row * tile_cols + tile_col] = - &cm->mi[tile.mi_row_start * cm->mode_info_stride - + tile.mi_col_start]; - } - } - - // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm - // block where mi unit size is 8x8. - CHECK_MEM_ERROR(cm, pbi->above_context[0], - vpx_realloc(pbi->above_context[0], - sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * - 2 * aligned_mi_cols)); - for (i = 1; i < MAX_MB_PLANE; ++i) { - pbi->above_context[i] = pbi->above_context[0] + - i * sizeof(*pbi->above_context[0]) * - 2 * aligned_mi_cols; - } - - // This is sized based on the entire frame. Each tile operates within its - // column bounds. - CHECK_MEM_ERROR(cm, pbi->above_seg_context, - vpx_realloc(pbi->above_seg_context, - sizeof(*pbi->above_seg_context) * - aligned_mi_cols)); -} - -static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, - TX_SIZE tx_size, int x, int y) { - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int eob = pd->eobs[block]; - if (eob > 0) { - TX_TYPE tx_type; - const int plane_type = pd->plane_type; - const int stride = pd->dst.stride; - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - uint8_t *const dst = &pd->dst.buf[4 * y * stride + 4 * x]; - - switch (tx_size) { - case TX_4X4: - tx_type = get_tx_type_4x4(plane_type, xd, block); - if (tx_type == DCT_DCT) - xd->itxm_add(dqcoeff, dst, stride, eob); - else - vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type); - break; - case TX_8X8: - tx_type = get_tx_type_8x8(plane_type, xd); - vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_16X16: - tx_type = get_tx_type_16x16(plane_type, xd); - vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob); - break; - case TX_32X32: - tx_type = DCT_DCT; - vp9_idct32x32_add(dqcoeff, dst, stride, eob); - break; - default: - assert(!"Invalid transform size"); - } - - if (eob == 1) { - vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0])); - } else { - if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) - vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0])); - else if (tx_size == TX_32X32 && eob <= 34) - vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0])); - else - vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0])); - } - } -} - -struct intra_args { - VP9_COMMON *cm; - MACROBLOCKD *xd; - vp9_reader *r; - uint8_t *token_cache; -}; - -static void predict_and_reconstruct_intra_block(int plane, int block, - BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct intra_args *const args = arg; - VP9_COMMON *const cm = args->cm; - MACROBLOCKD *const xd = args->xd; - struct macroblockd_plane *const pd = &xd->plane[plane]; - MODE_INFO *const mi = xd->mi_8x8[0]; - const MB_PREDICTION_MODE mode = (plane == 0) - ? ((mi->mbmi.sb_type < BLOCK_8X8) ? 
mi->bmi[block].as_mode - : mi->mbmi.mode) - : mi->mbmi.uv_mode; - int x, y; - uint8_t *dst; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x]; - - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) - extend_for_intra(xd, plane_bsize, plane, block, tx_size); - - vp9_predict_intra_block(xd, block >> (tx_size << 1), - b_width_log2(plane_bsize), tx_size, mode, - dst, pd->dst.stride, dst, pd->dst.stride); - - if (!mi->mbmi.skip_coeff) { - vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size, - args->r, args->token_cache); - inverse_transform_block(xd, plane, block, tx_size, x, y); - } -} - -struct inter_args { - VP9_COMMON *cm; - MACROBLOCKD *xd; - vp9_reader *r; - int *eobtotal; - uint8_t *token_cache; -}; - -static void reconstruct_inter_block(int plane, int block, - BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct inter_args *args = arg; - VP9_COMMON *const cm = args->cm; - MACROBLOCKD *const xd = args->xd; - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - - *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, - plane_bsize, x, y, tx_size, - args->r, args->token_cache); - inverse_transform_block(xd, plane, block, tx_size, x, y); -} - -static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, - const TileInfo *const tile, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int bh = num_8x8_blocks_high_lookup[bsize]; - const int bw = num_8x8_blocks_wide_lookup[bsize]; - const int offset = mi_row * cm->mode_info_stride + mi_col; - const int tile_offset = tile->mi_row_start * cm->mode_info_stride + - tile->mi_col_start; - - xd->mi_8x8 = cm->mi_grid_visible + offset; - xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset; - - // we are using the mode info context stream here - xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset; - xd->mi_8x8[0]->mbmi.sb_type = bsize; - - // Special case: if prev_mi is NULL, the previous mode info context - // cannot be used. - xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; - - set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col); - - // Distance of Mb to the various image edges. 
These are specified to 8th pel - // as they are always compared to values that are in 1/8th pel units - set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); - - setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col); -} - -static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, - int idx, int mi_row, int mi_col) { - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const int ref = mbmi->ref_frame[idx] - LAST_FRAME; - const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref); - const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref]; - if (!vp9_is_valid_scale(sfc)) - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid scale factors"); - - xd->scale_factor[idx].sfc = sfc; - setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]); - xd->corrupted |= cfg->corrupted; -} - -static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, - const TileInfo *const tile, - int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE bsize, - uint8_t *token_cache) { - const int less8x8 = bsize < BLOCK_8X8; - MB_MODE_INFO *mbmi; - - set_offsets(cm, xd, tile, bsize, mi_row, mi_col); - vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r); - - if (less8x8) - bsize = BLOCK_8X8; - - // Has to be called after set_offsets - mbmi = &xd->mi_8x8[0]->mbmi; - - if (mbmi->skip_coeff) { - reset_skip_context(xd, bsize); - } else { - if (cm->seg.enabled) - setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id, - cm->base_qindex)); - } - - if (!is_inter_block(mbmi)) { - struct intra_args arg = { - cm, xd, r, token_cache - }; - foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, - &arg); - } else { - // Setup - set_ref(cm, xd, 0, mi_row, mi_col); - if (has_second_ref(mbmi)) - set_ref(cm, xd, 1, mi_row, mi_col); - - xd->subpix.filter_x = xd->subpix.filter_y = - vp9_get_filter_kernel(mbmi->interp_filter); - - // Prediction - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - - // Reconstruction - if (!mbmi->skip_coeff) { - int eobtotal = 0; - struct inter_args arg = { - cm, xd, r, &eobtotal, token_cache - }; - foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); - if (!less8x8 && eobtotal == 0) - mbmi->skip_coeff = 1; // skip loopfilter - } - } - - xd->corrupted |= vp9_reader_has_error(r); -} - -static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, - int mi_row, int mi_col, BLOCK_SIZE bsize, - vp9_reader *r) { - const int ctx = partition_plane_context(xd->above_seg_context, - xd->left_seg_context, - mi_row, mi_col, bsize); - const vp9_prob *const probs = get_partition_probs(cm, ctx); - const int has_rows = (mi_row + hbs) < cm->mi_rows; - const int has_cols = (mi_col + hbs) < cm->mi_cols; - PARTITION_TYPE p; - - if (has_rows && has_cols) - p = treed_read(r, vp9_partition_tree, probs); - else if (!has_rows && has_cols) - p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ; - else if (has_rows && !has_cols) - p = vp9_read(r, probs[2]) ? 
PARTITION_SPLIT : PARTITION_VERT; - else - p = PARTITION_SPLIT; - - if (!cm->frame_parallel_decoding_mode) - ++cm->counts.partition[ctx][p]; - - return p; -} - -static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, - const TileInfo *const tile, - int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE bsize, - uint8_t *token_cache) { - const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; - PARTITION_TYPE partition; - BLOCK_SIZE subsize; - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) - return; - - partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); - subsize = get_subsize(bsize, partition); - if (subsize < BLOCK_8X8) { - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); - } else { - switch (partition) { - case PARTITION_NONE: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); - break; - case PARTITION_HORZ: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); - if (mi_row + hbs < cm->mi_rows) - decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, - token_cache); - break; - case PARTITION_VERT: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); - if (mi_col + hbs < cm->mi_cols) - decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, - token_cache); - break; - case PARTITION_SPLIT: - decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize, - token_cache); - decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, - token_cache); - decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, - token_cache); - decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize, - token_cache); - break; - default: - assert(!"Invalid partition type"); - } - } - - // update partition context - if (bsize >= BLOCK_8X8 && - (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(xd->above_seg_context, xd->left_seg_context, - mi_row, mi_col, subsize, bsize); -} - -static void setup_token_decoder(const uint8_t *data, - const uint8_t *data_end, - size_t read_size, - struct vpx_internal_error_info *error_info, - vp9_reader *r) { - // Validate the calculated partition length. If the buffer - // described by the partition can't be fully read, then restrict - // it to the portion that can be (for EC mode) or throw an error. 
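The read_is_valid() predicate used just below is worth a standalone sketch (names hypothetical): the first term of the conjunction rejects both len == 0 and pointer-arithmetic wraparound before the upper bound is trusted.

#include <stddef.h>
#include <stdint.h>

/* Nonzero iff [start, start + len) is a non-empty span ending at or
 * before end, with no overflow in start + len. */
static int span_is_valid(const uint8_t *start, size_t len,
                         const uint8_t *end) {
  return start + len > start && start + len <= end;
}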
- if (!read_is_valid(data, read_size, data_end)) - vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt tile length"); - - if (vp9_reader_init(r, data, read_size)) - vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR, - "Failed to allocate bool decoder %d", 1); -} - -static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs, - vp9_reader *r) { - int i, j, k, l, m; - - if (vp9_read_bit(r)) - for (i = 0; i < BLOCK_TYPES; i++) - for (j = 0; j < REF_TYPES; j++) - for (k = 0; k < COEF_BANDS; k++) - for (l = 0; l < PREV_COEF_CONTEXTS; l++) - if (k > 0 || l < 3) - for (m = 0; m < UNCONSTRAINED_NODES; m++) - vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]); -} - -static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, - vp9_reader *r) { - const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; - TX_SIZE tx_size; - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - read_coef_probs_common(fc->coef_probs[tx_size], r); -} - -static void setup_segmentation(struct segmentation *seg, - struct vp9_read_bit_buffer *rb) { - int i, j; - - seg->update_map = 0; - seg->update_data = 0; - - seg->enabled = vp9_rb_read_bit(rb); - if (!seg->enabled) - return; - - // Segmentation map update - seg->update_map = vp9_rb_read_bit(rb); - if (seg->update_map) { - for (i = 0; i < SEG_TREE_PROBS; i++) - seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) - : MAX_PROB; - - seg->temporal_update = vp9_rb_read_bit(rb); - if (seg->temporal_update) { - for (i = 0; i < PREDICTION_PROBS; i++) - seg->pred_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) - : MAX_PROB; - } else { - for (i = 0; i < PREDICTION_PROBS; i++) - seg->pred_probs[i] = MAX_PROB; - } - } - - // Segmentation data update - seg->update_data = vp9_rb_read_bit(rb); - if (seg->update_data) { - seg->abs_delta = vp9_rb_read_bit(rb); - - vp9_clearall_segfeatures(seg); - - for (i = 0; i < MAX_SEGMENTS; i++) { - for (j = 0; j < SEG_LVL_MAX; j++) { - int data = 0; - const int feature_enabled = vp9_rb_read_bit(rb); - if (feature_enabled) { - vp9_enable_segfeature(seg, i, j); - data = decode_unsigned_max(rb, vp9_seg_feature_data_max(j)); - if (vp9_is_segfeature_signed(j)) - data = vp9_rb_read_bit(rb) ? -data : data; - } - vp9_set_segdata(seg, i, j, data); - } - } - } -} - -static void setup_loopfilter(struct loopfilter *lf, - struct vp9_read_bit_buffer *rb) { - lf->filter_level = vp9_rb_read_literal(rb, 6); - lf->sharpness_level = vp9_rb_read_literal(rb, 3); - - // Read in loop filter deltas applied at the MB level based on mode or ref - // frame. - lf->mode_ref_delta_update = 0; - - lf->mode_ref_delta_enabled = vp9_rb_read_bit(rb); - if (lf->mode_ref_delta_enabled) { - lf->mode_ref_delta_update = vp9_rb_read_bit(rb); - if (lf->mode_ref_delta_update) { - int i; - - for (i = 0; i < MAX_REF_LF_DELTAS; i++) - if (vp9_rb_read_bit(rb)) - lf->ref_deltas[i] = vp9_rb_read_signed_literal(rb, 6); - - for (i = 0; i < MAX_MODE_LF_DELTAS; i++) - if (vp9_rb_read_bit(rb)) - lf->mode_deltas[i] = vp9_rb_read_signed_literal(rb, 6); - } - } -} - -static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) { - const int old = *delta_q; - *delta_q = vp9_rb_read_bit(rb) ? 
vp9_rb_read_signed_literal(rb, 4) : 0; - return old != *delta_q; -} - -static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd, - struct vp9_read_bit_buffer *rb) { - int update = 0; - - cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS); - update |= read_delta_q(rb, &cm->y_dc_delta_q); - update |= read_delta_q(rb, &cm->uv_dc_delta_q); - update |= read_delta_q(rb, &cm->uv_ac_delta_q); - if (update) - vp9_init_dequantizer(cm); - - xd->lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; - - xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; -} - -static INTERPOLATION_TYPE read_interp_filter_type( - struct vp9_read_bit_buffer *rb) { - const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH, - EIGHTTAP, - EIGHTTAP_SHARP, - BILINEAR }; - return vp9_rb_read_bit(rb) ? SWITCHABLE - : literal_to_type[vp9_rb_read_literal(rb, 2)]; -} - -static void read_frame_size(struct vp9_read_bit_buffer *rb, - int *width, int *height) { - const int w = vp9_rb_read_literal(rb, 16) + 1; - const int h = vp9_rb_read_literal(rb, 16) + 1; - *width = w; - *height = h; -} - -static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - cm->display_width = cm->width; - cm->display_height = cm->height; - if (vp9_rb_read_bit(rb)) - read_frame_size(rb, &cm->display_width, &cm->display_height); -} - -static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { - VP9_COMMON *cm = &pbi->common; - - if (cm->width != width || cm->height != height) { - // Change in frame size. - if (cm->width == 0 || cm->height == 0) { - // Assign new frame buffer on first call. - cm->new_fb_idx = NUM_YV12_BUFFERS - 1; - cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1; - } - - // TODO(agrange) Don't test width/height, check overall size. - if (width > cm->width || height > cm->height) { - // Rescale frame buffers only if they're not big enough already. 
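A brief aside on read_frame_size() above (a sketch, names hypothetical): each dimension is coded as a 16-bit literal plus one, so the representable range is 1..65536 and a zero-sized frame cannot be signaled. The encoder-side inverse is simply:

#include <assert.h>
#include <stdint.h>

/* A dimension w in [1, 65536] travels as the 16-bit literal w - 1. */
static uint32_t frame_size_to_literal(uint32_t w) {
  assert(w >= 1 && w <= 65536);
  return w - 1;
}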
-      if (vp9_resize_frame_buffers(cm, width, height))
-        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffers");
-    }
-
-    cm->width = width;
-    cm->height = height;
-
-    vp9_update_frame_size(cm);
-  }
-
-  vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
-                           cm->subsampling_x, cm->subsampling_y,
-                           VP9BORDERINPIXELS);
-}
-
-static void setup_frame_size(VP9D_COMP *pbi,
-                             struct vp9_read_bit_buffer *rb) {
-  int width, height;
-  read_frame_size(rb, &width, &height);
-  apply_frame_size(pbi, width, height);
-  setup_display_size(&pbi->common, rb);
-}
-
-static void setup_frame_size_with_refs(VP9D_COMP *pbi,
-                                       struct vp9_read_bit_buffer *rb) {
-  VP9_COMMON *const cm = &pbi->common;
-
-  int width, height;
-  int found = 0, i;
-  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
-    if (vp9_rb_read_bit(rb)) {
-      YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i);
-      width = cfg->y_crop_width;
-      height = cfg->y_crop_height;
-      found = 1;
-      break;
-    }
-  }
-
-  if (!found)
-    read_frame_size(rb, &width, &height);
-
-  if (!width || !height)
-    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Referenced frame with invalid size");
-
-  apply_frame_size(pbi, width, height);
-  setup_display_size(cm, rb);
-}
-
-static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
-                               int tile_row, int tile_col) {
-  int i;
-  const int tile_cols = 1 << pbi->common.log2_tile_cols;
-  xd->mi_stream = pbi->mi_streams[tile_row * tile_cols + tile_col];
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    xd->above_context[i] = pbi->above_context[i];
-  }
-  // see note in alloc_tile_storage().
-  xd->above_seg_context = pbi->above_seg_context;
-}
-
-static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
-                        vp9_reader *r) {
-  const int num_threads = pbi->oxcf.max_threads;
-  VP9_COMMON *const cm = &pbi->common;
-  int mi_row, mi_col;
-  MACROBLOCKD *xd = &pbi->mb;
-
-  if (pbi->do_loopfilter_inline) {
-    LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-    lf_data->frame_buffer = get_frame_new_buffer(cm);
-    lf_data->cm = cm;
-    lf_data->xd = pbi->mb;
-    lf_data->stop = 0;
-    lf_data->y_only = 0;
-    vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
-  }
-
-  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
-    // For a SB there are 2 left contexts, each pertaining to a MB row
-    // within the SB.
-    vp9_zero(xd->left_context);
-    vp9_zero(xd->left_seg_context);
-    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE) {
-      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
-                      pbi->token_cache);
-    }
-
-    if (pbi->do_loopfilter_inline) {
-      const int lf_start = mi_row - MI_BLOCK_SIZE;
-      LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-
-      // delay the loopfilter by 1 superblock row.
-      if (lf_start < 0) continue;
-
-      // decoding has completed: finish up the loop filter in this thread.
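// (If this is the last SB row of the tile, no filter work is issued from
// inside the loop; the sync/execute block after the tile loop below
// filters the remaining rows in this thread.)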
- if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue; - - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_start; - lf_data->stop = mi_row; - if (num_threads > 1) { - vp9_worker_launch(&pbi->lf_worker); - } else { - vp9_worker_execute(&pbi->lf_worker); - } - } - } - - if (pbi->do_loopfilter_inline) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_data->stop; - lf_data->stop = cm->mi_rows; - vp9_worker_execute(&pbi->lf_worker); - } -} - -static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - int min_log2_tile_cols, max_log2_tile_cols, max_ones; - vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - - // columns - max_ones = max_log2_tile_cols - min_log2_tile_cols; - cm->log2_tile_cols = min_log2_tile_cols; - while (max_ones-- && vp9_rb_read_bit(rb)) - cm->log2_tile_cols++; - - // rows - cm->log2_tile_rows = vp9_rb_read_bit(rb); - if (cm->log2_tile_rows) - cm->log2_tile_rows += vp9_rb_read_bit(rb); -} - -// Reads the next tile returning its size and adjusting '*data' accordingly -// based on 'is_last'. -static size_t get_tile(const uint8_t *const data_end, - int is_last, - struct vpx_internal_error_info *error_info, - const uint8_t **data) { - size_t size; - - if (!is_last) { - if (!read_is_valid(*data, 4, data_end)) - vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt tile length"); - - size = read_be32(*data); - *data += 4; - - if (size > data_end - *data) { - vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt tile size"); - } - } else { - size = data_end - *data; - } - return size; -} - -typedef struct TileBuffer { - const uint8_t *data; - size_t size; -} TileBuffer; - -static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - TileBuffer tile_buffers[4][1 << 6]; - int tile_row, tile_col; - const uint8_t *const data_end = pbi->source + pbi->source_sz; - const uint8_t *end = NULL; - vp9_reader r; - - assert(tile_rows <= 4); - assert(tile_cols <= (1 << 6)); - - // Note: this memset assumes above_context[0], [1] and [2] - // are allocated as part of the same buffer. - vpx_memset(pbi->above_context[0], 0, - sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * 2 * aligned_cols); - - vpx_memset(pbi->above_seg_context, 0, - sizeof(*pbi->above_seg_context) * aligned_cols); - - // Load tile data into tile_buffers - for (tile_row = 0; tile_row < tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const int last_tile = tile_row == tile_rows - 1 && - tile_col == tile_cols - 1; - const size_t size = get_tile(data_end, last_tile, &cm->error, &data); - TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; - buf->data = data; - buf->size = size; - data += size; - } - } - - // Decode tiles using data from tile_buffers - for (tile_row = 0; tile_row < tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const int col = pbi->oxcf.inv_tile_order ? 
tile_cols - tile_col - 1 - : tile_col; - const int last_tile = tile_row == tile_rows - 1 && - col == tile_cols - 1; - const TileBuffer *const buf = &tile_buffers[tile_row][col]; - TileInfo tile; - - vp9_tile_init(&tile, cm, tile_row, col); - setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r); - setup_tile_context(pbi, xd, tile_row, col); - decode_tile(pbi, &tile, &r); - - if (last_tile) - end = vp9_reader_find_end(&r); - } - } - - return end; -} - -static void setup_tile_macroblockd(TileWorkerData *const tile_data) { - MACROBLOCKD *xd = &tile_data->xd; - struct macroblockd_plane *const pd = xd->plane; - int i; - - for (i = 0; i < MAX_MB_PLANE; ++i) { - pd[i].qcoeff = tile_data->qcoeff[i]; - pd[i].dqcoeff = tile_data->dqcoeff[i]; - pd[i].eobs = tile_data->eobs[i]; - vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t)); - } -} - -static int tile_worker_hook(void *arg1, void *arg2) { - TileWorkerData *const tile_data = (TileWorkerData*)arg1; - const TileInfo *const tile = (TileInfo*)arg2; - int mi_row, mi_col; - - for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; - mi_row += MI_BLOCK_SIZE) { - vp9_zero(tile_data->xd.left_context); - vp9_zero(tile_data->xd.left_seg_context); - for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE) { - decode_modes_sb(tile_data->cm, &tile_data->xd, tile, - mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, - tile_data->token_cache); - } - } - return !tile_data->xd.corrupted; -} - -static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) { - VP9_COMMON *const cm = &pbi->common; - const uint8_t *const data_end = pbi->source + pbi->source_sz; - const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols); - int tile_col = 0; - - assert(tile_rows == 1); - (void)tile_rows; - - if (num_workers > pbi->num_tile_workers) { - int i; - CHECK_MEM_ERROR(cm, pbi->tile_workers, - vpx_realloc(pbi->tile_workers, - num_workers * sizeof(*pbi->tile_workers))); - for (i = pbi->num_tile_workers; i < num_workers; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; - ++pbi->num_tile_workers; - - vp9_worker_init(worker); - worker->hook = (VP9WorkerHook)tile_worker_hook; - CHECK_MEM_ERROR(cm, worker->data1, - vpx_memalign(32, sizeof(TileWorkerData))); - CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo))); - if (i < num_workers - 1 && !vp9_worker_reset(worker)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Tile decoder thread creation failed"); - } - } - } - - // Note: this memset assumes above_context[0], [1] and [2] - // are allocated as part of the same buffer. 
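// Layout sketch of that assumption (one contiguous allocation, with
// 2 * aligned_mi_cols entries per plane):
//   above_context[0] -> | plane 0 | plane 1 | plane 2 |
// so a single memset from above_context[0] of
// MAX_MB_PLANE * 2 * aligned_mi_cols entries clears all three planes.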
- vpx_memset(pbi->above_context[0], 0, - sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * - 2 * aligned_mi_cols); - vpx_memset(pbi->above_seg_context, 0, - sizeof(*pbi->above_seg_context) * aligned_mi_cols); - - while (tile_col < tile_cols) { - int i; - for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; - TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; - TileInfo *const tile = (TileInfo*)worker->data2; - const size_t size = - get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data); - - tile_data->cm = cm; - tile_data->xd = pbi->mb; - tile_data->xd.corrupted = 0; - vp9_tile_init(tile, tile_data->cm, 0, tile_col); - - setup_token_decoder(data, data_end, size, &cm->error, - &tile_data->bit_reader); - setup_tile_context(pbi, &tile_data->xd, 0, tile_col); - setup_tile_macroblockd(tile_data); - - worker->had_error = 0; - if (i == num_workers - 1 || tile_col == tile_cols - 1) { - vp9_worker_execute(worker); - } else { - vp9_worker_launch(worker); - } - - data += size; - ++tile_col; - } - - for (; i > 0; --i) { - VP9Worker *const worker = &pbi->tile_workers[i - 1]; - pbi->mb.corrupted |= !vp9_worker_sync(worker); - } - } - - { - const int final_worker = (tile_cols + num_workers - 1) % num_workers; - TileWorkerData *const tile_data = - (TileWorkerData*)pbi->tile_workers[final_worker].data1; - return vp9_reader_find_end(&tile_data->bit_reader); - } -} - -static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { - if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 || - vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 || - vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) { - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid frame sync code"); - } -} - -static void error_handler(void *data, size_t bit_offset) { - VP9_COMMON *const cm = (VP9_COMMON *)data; - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet"); -} - -#define RESERVED \ - if (vp9_rb_read_bit(rb)) \ - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \ - "Reserved bit must be unset") - -static size_t read_uncompressed_header(VP9D_COMP *pbi, - struct vp9_read_bit_buffer *rb) { - VP9_COMMON *const cm = &pbi->common; - size_t sz; - int i; - - cm->last_frame_type = cm->frame_type; - - if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER) - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid frame marker"); - - cm->version = vp9_rb_read_bit(rb); - RESERVED; - - if (vp9_rb_read_bit(rb)) { - // show an existing frame directly - int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show); - pbi->refresh_frame_flags = 0; - cm->lf.filter_level = 0; - return 0; - } - - cm->frame_type = (FRAME_TYPE) vp9_rb_read_bit(rb); - cm->show_frame = vp9_rb_read_bit(rb); - cm->error_resilient_mode = vp9_rb_read_bit(rb); - - if (cm->frame_type == KEY_FRAME) { - check_sync_code(cm, rb); - - cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace - if (cm->color_space != SRGB) { - vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range - if (cm->version == 1) { - cm->subsampling_x = vp9_rb_read_bit(rb); - cm->subsampling_y = vp9_rb_read_bit(rb); - vp9_rb_read_bit(rb); // has extra plane - } else { - cm->subsampling_y = cm->subsampling_x = 1; - } - } else { - if (cm->version == 1) { - cm->subsampling_y = cm->subsampling_x = 0; - vp9_rb_read_bit(rb); // has extra plane - } else { - vpx_internal_error(&cm->error, 
VPX_CODEC_UNSUP_BITSTREAM, - "RGB not supported in profile 0"); - } - } - - pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1; - - for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) - cm->active_ref_idx[i] = cm->new_fb_idx; - - setup_frame_size(pbi, rb); - } else { - cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb); - - cm->reset_frame_context = cm->error_resilient_mode ? - 0 : vp9_rb_read_literal(rb, 2); - - if (cm->intra_only) { - check_sync_code(cm, rb); - - pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); - setup_frame_size(pbi, rb); - } else { - pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES); - - for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2); - cm->active_ref_idx[i] = cm->ref_frame_map[ref]; - cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb); - } - - setup_frame_size_with_refs(pbi, rb); - - cm->allow_high_precision_mv = vp9_rb_read_bit(rb); - cm->mcomp_filter_type = read_interp_filter_type(rb); - - for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) - vp9_setup_scale_factors(cm, i); - } - } - - if (!cm->error_resilient_mode) { - cm->refresh_frame_context = vp9_rb_read_bit(rb); - cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); - } else { - cm->refresh_frame_context = 0; - cm->frame_parallel_decoding_mode = 1; - } - - // This flag will be overridden by the call to vp9_setup_past_independence - // below, forcing the use of context 0 for those frame types. - cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2); - - if (frame_is_intra_only(cm) || cm->error_resilient_mode) - vp9_setup_past_independence(cm); - - setup_loopfilter(&cm->lf, rb); - setup_quantization(cm, &pbi->mb, rb); - setup_segmentation(&cm->seg, rb); - - setup_tile_info(cm, rb); - sz = vp9_rb_read_literal(rb, 16); - - if (sz == 0) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Invalid header size"); - - return sz; -} - -static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data, - size_t partition_size) { - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - FRAME_CONTEXT *const fc = &cm->fc; - vp9_reader r; - int k; - - if (vp9_reader_init(&r, data, partition_size)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate bool decoder 0"); - - cm->tx_mode = xd->lossless ? 
ONLY_4X4 : read_tx_mode(&r); - if (cm->tx_mode == TX_MODE_SELECT) - read_tx_probs(&fc->tx_probs, &r); - read_coef_probs(fc, cm->tx_mode, &r); - - for (k = 0; k < MBSKIP_CONTEXTS; ++k) - vp9_diff_update_prob(&r, &fc->mbskip_probs[k]); - - if (!frame_is_intra_only(cm)) { - nmv_context *const nmvc = &fc->nmvc; - int i, j; - - read_inter_mode_probs(fc, &r); - - if (cm->mcomp_filter_type == SWITCHABLE) - read_switchable_interp_probs(fc, &r); - - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) - vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]); - - read_comp_pred(cm, &r); - - for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - for (i = 0; i < INTRA_MODES - 1; ++i) - vp9_diff_update_prob(&r, &fc->y_mode_prob[j][i]); - - for (j = 0; j < PARTITION_CONTEXTS; ++j) - for (i = 0; i < PARTITION_TYPES - 1; ++i) - vp9_diff_update_prob(&r, &fc->partition_prob[j][i]); - - read_mv_probs(nmvc, cm->allow_high_precision_mv, &r); - } - - return vp9_reader_has_error(&r); -} - -void vp9_init_dequantizer(VP9_COMMON *cm) { - int q; - - for (q = 0; q < QINDEX_RANGE; q++) { - cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q); - cm->y_dequant[q][1] = vp9_ac_quant(q, 0); - - cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q); - cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q); - } -} - -#ifdef NDEBUG -#define debug_check_frame_counts(cm) (void)0 -#else // !NDEBUG -// Counts should only be incremented when frame_parallel_decoding_mode and -// error_resilient_mode are disabled. -static void debug_check_frame_counts(const VP9_COMMON *const cm) { - FRAME_COUNTS zero_counts; - vp9_zero(zero_counts); - assert(cm->frame_parallel_decoding_mode || cm->error_resilient_mode); - assert(!memcmp(cm->counts.y_mode, zero_counts.y_mode, - sizeof(cm->counts.y_mode))); - assert(!memcmp(cm->counts.uv_mode, zero_counts.uv_mode, - sizeof(cm->counts.uv_mode))); - assert(!memcmp(cm->counts.partition, zero_counts.partition, - sizeof(cm->counts.partition))); - assert(!memcmp(cm->counts.coef, zero_counts.coef, - sizeof(cm->counts.coef))); - assert(!memcmp(cm->counts.eob_branch, zero_counts.eob_branch, - sizeof(cm->counts.eob_branch))); - assert(!memcmp(cm->counts.switchable_interp, zero_counts.switchable_interp, - sizeof(cm->counts.switchable_interp))); - assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode, - sizeof(cm->counts.inter_mode))); - assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter, - sizeof(cm->counts.intra_inter))); - assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter, - sizeof(cm->counts.comp_inter))); - assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref, - sizeof(cm->counts.single_ref))); - assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref, - sizeof(cm->counts.comp_ref))); - assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx))); - assert(!memcmp(cm->counts.mbskip, zero_counts.mbskip, - sizeof(cm->counts.mbskip))); - assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv))); -} -#endif // NDEBUG - -int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { - int i; - VP9_COMMON *const cm = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; - - const uint8_t *data = pbi->source; - const uint8_t *const data_end = pbi->source + pbi->source_sz; - - struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler }; - const size_t first_partition_size = read_uncompressed_header(pbi, &rb); - const int keyframe = cm->frame_type == KEY_FRAME; - const int tile_rows = 1 << cm->log2_tile_rows; - const int tile_cols = 1 << cm->log2_tile_cols; 
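// (Example: log2_tile_rows == 1 and log2_tile_cols == 2 give
// tile_rows == 2 and tile_cols == 4, i.e. a 2 x 4 grid of tiles.)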
- YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); - - if (!first_partition_size) { - // showing a frame directly - *p_data_end = data + 1; - return 0; - } - - if (!pbi->decoded_key_frame && !keyframe) - return -1; - - data += vp9_rb_bytes_read(&rb); - if (!read_is_valid(data, first_partition_size, data_end)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Truncated packet or corrupt header length"); - - pbi->do_loopfilter_inline = - (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level; - if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData))); - pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; - if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Loop filter thread creation failed"); - } - } - - alloc_tile_storage(pbi, tile_rows, tile_cols); - - xd->mode_info_stride = cm->mode_info_stride; - set_prev_mi(cm); - - setup_plane_dequants(cm, xd, cm->base_qindex); - setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y); - - cm->fc = cm->frame_contexts[cm->frame_context_idx]; - vp9_zero(cm->counts); - for (i = 0; i < MAX_MB_PLANE; ++i) - vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t)); - - xd->corrupted = 0; - new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size); - - // TODO(jzern): remove frame_parallel_decoding_mode restriction for - // single-frame tile decoding. - if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 && - cm->frame_parallel_decoding_mode) { - *p_data_end = decode_tiles_mt(pbi, data + first_partition_size); - } else { - *p_data_end = decode_tiles(pbi, data + first_partition_size); - } - - cm->last_width = cm->width; - cm->last_height = cm->height; - - new_fb->corrupted |= xd->corrupted; - - if (!pbi->decoded_key_frame) { - if (keyframe && !new_fb->corrupted) - pbi->decoded_key_frame = 1; - else - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "A stream must start with a complete key frame"); - } - - if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(cm); - - if (!frame_is_intra_only(cm)) { - vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); - } - } else { - debug_check_frame_counts(cm); - } - - if (cm->refresh_frame_context) - cm->frame_contexts[cm->frame_context_idx] = cm->fc; - - return 0; -} diff --git a/media/libvpx/vp9/decoder/vp9_decodframe.h b/media/libvpx/vp9/decoder/vp9_decodframe.h deleted file mode 100644 index c665f6fc1911..000000000000 --- a/media/libvpx/vp9/decoder/vp9_decodframe.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef VP9_DECODER_VP9_DECODFRAME_H_ -#define VP9_DECODER_VP9_DECODFRAME_H_ - -struct VP9Common; -struct VP9Decompressor; - -void vp9_init_dequantizer(struct VP9Common *cm); -int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end); - -#endif // VP9_DECODER_VP9_DECODFRAME_H_ diff --git a/media/libvpx/vp9/decoder/vp9_detokenize.c b/media/libvpx/vp9/decoder/vp9_detokenize.c index 05a2b427bb01..91cdf3860e87 100644 --- a/media/libvpx/vp9/decoder/vp9_detokenize.c +++ b/media/libvpx/vp9/decoder/vp9_detokenize.c @@ -13,74 +13,35 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_seg_common.h" -#include "vp9/decoder/vp9_dboolhuff.h" #include "vp9/decoder/vp9_detokenize.h" -#include "vp9/decoder/vp9_onyxd_int.h" -#include "vp9/decoder/vp9_treereader.h" #define EOB_CONTEXT_NODE 0 #define ZERO_CONTEXT_NODE 1 #define ONE_CONTEXT_NODE 2 -#define LOW_VAL_CONTEXT_NODE 3 -#define TWO_CONTEXT_NODE 4 -#define THREE_CONTEXT_NODE 5 -#define HIGH_LOW_CONTEXT_NODE 6 -#define CAT_ONE_CONTEXT_NODE 7 -#define CAT_THREEFOUR_CONTEXT_NODE 8 -#define CAT_THREE_CONTEXT_NODE 9 -#define CAT_FIVE_CONTEXT_NODE 10 - -#define CAT1_MIN_VAL 5 -#define CAT2_MIN_VAL 7 -#define CAT3_MIN_VAL 11 -#define CAT4_MIN_VAL 19 -#define CAT5_MIN_VAL 35 -#define CAT6_MIN_VAL 67 -#define CAT1_PROB0 159 -#define CAT2_PROB0 145 -#define CAT2_PROB1 165 - -#define CAT3_PROB0 140 -#define CAT3_PROB1 148 -#define CAT3_PROB2 173 - -#define CAT4_PROB0 135 -#define CAT4_PROB1 140 -#define CAT4_PROB2 155 -#define CAT4_PROB3 176 - -#define CAT5_PROB0 130 -#define CAT5_PROB1 134 -#define CAT5_PROB2 141 -#define CAT5_PROB3 157 -#define CAT5_PROB4 180 - -static const vp9_prob cat6_prob[15] = { - 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 -}; - -static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = { - ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN, - TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, - TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN -}; +#define LOW_VAL_CONTEXT_NODE 0 +#define TWO_CONTEXT_NODE 1 +#define THREE_CONTEXT_NODE 2 +#define HIGH_LOW_CONTEXT_NODE 3 +#define CAT_ONE_CONTEXT_NODE 4 +#define CAT_THREEFOUR_CONTEXT_NODE 5 +#define CAT_THREE_CONTEXT_NODE 6 +#define CAT_FIVE_CONTEXT_NODE 7 #define INCREMENT_COUNT(token) \ do { \ if (!cm->frame_parallel_decoding_mode) \ - ++coef_counts[band][pt][token_to_counttoken[token]]; \ + ++coef_counts[band][ctx][token]; \ } while (0) - #define WRITE_COEF_CONTINUE(val, token) \ { \ - dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -val : val) * \ - dq[c > 0] / (1 + (tx_size == TX_32X32)); \ - INCREMENT_COUNT(token); \ + v = (val * dqv) >> dq_shift; \ + dqcoeff[scan[c]] = vp9_read_bit(r) ? 
-v : v; \ token_cache[scan[c]] = vp9_pt_energy_class[token]; \ ++c; \ + ctx = get_coef_context(nb, token_cache, c); \ + dqv = dq[1]; \ continue; \ } @@ -89,69 +50,61 @@ static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = { val += (vp9_read(r, prob) << bits_count); \ } while (0) -static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, - vp9_reader *r, int block_idx, - PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr, - TX_SIZE tx_size, const int16_t *dq, int pt, - uint8_t *token_cache) { +static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, + int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, + int ctx, const int16_t *scan, const int16_t *nb, + vp9_reader *r) { + const int max_eob = 16 << (tx_size << 1); const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; - const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); + const int ref = is_inter_block(&xd->mi[0]->mbmi); int band, c = 0; - const vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] = + const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = fc->coef_probs[tx_size][type][ref]; - vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; - uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } }; const vp9_prob *prob; - unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES + 1] = + unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1] = counts->coef[tx_size][type][ref]; - unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] = + unsigned int (*eob_branch_count)[COEFF_CONTEXTS] = counts->eob_branch[tx_size][type][ref]; - const int16_t *scan, *nb; - const uint8_t *cat6; + uint8_t token_cache[32 * 32]; const uint8_t *band_translate = get_band_translate(tx_size); - get_scan(xd, tx_size, type, block_idx, &scan, &nb); + const int dq_shift = (tx_size == TX_32X32); + int v; + int16_t dqv = dq[0]; - while (c < seg_eob) { + while (c < max_eob) { int val; - if (c) - pt = get_coef_context(nb, token_cache, c); band = *band_translate++; - prob = coef_probs[band][pt]; + prob = coef_probs[band][ctx]; if (!cm->frame_parallel_decoding_mode) - ++eob_branch_count[band][pt]; - if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) + ++eob_branch_count[band][ctx]; + if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) { + INCREMENT_COUNT(EOB_MODEL_TOKEN); break; - goto DECODE_ZERO; + } - SKIP_START: - if (c >= seg_eob) - break; - if (c) - pt = get_coef_context(nb, token_cache, c); - band = *band_translate++; - prob = coef_probs[band][pt]; - - DECODE_ZERO: - if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { + while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); - token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; + dqv = dq[1]; + token_cache[scan[c]] = 0; ++c; - goto SKIP_START; + if (c >= max_eob) + return c; // zero tokens at the end (no eob token) + ctx = get_coef_context(nb, token_cache, c); + band = *band_translate++; + prob = coef_probs[band][ctx]; } // ONE_CONTEXT_NODE_0_ if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) { + INCREMENT_COUNT(ONE_TOKEN); WRITE_COEF_CONTINUE(1, ONE_TOKEN); } - // Load full probabilities if not already loaded - if (!load_map[band][pt]) { - vp9_model_to_full_probs(coef_probs[band][pt], - coef_probs_full[band][pt]); - load_map[band][pt] = 1; - } - prob = coef_probs_full[band][pt]; - // LOW_VAL_CONTEXT_NODE_0_ + + INCREMENT_COUNT(TWO_TOKEN); + + prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1]; + if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) { if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) { 
WRITE_COEF_CONTINUE(2, TWO_TOKEN); @@ -161,56 +114,62 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, } WRITE_COEF_CONTINUE(4, FOUR_TOKEN); } - // HIGH_LOW_CONTEXT_NODE_0_ + if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) { if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) { val = CAT1_MIN_VAL; - ADJUST_COEF(CAT1_PROB0, 0); - WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY1); + ADJUST_COEF(vp9_cat1_prob[0], 0); + WRITE_COEF_CONTINUE(val, CATEGORY1_TOKEN); } val = CAT2_MIN_VAL; - ADJUST_COEF(CAT2_PROB1, 1); - ADJUST_COEF(CAT2_PROB0, 0); - WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2); + ADJUST_COEF(vp9_cat2_prob[0], 1); + ADJUST_COEF(vp9_cat2_prob[1], 0); + WRITE_COEF_CONTINUE(val, CATEGORY2_TOKEN); } - // CAT_THREEFOUR_CONTEXT_NODE_0_ + if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) { if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) { val = CAT3_MIN_VAL; - ADJUST_COEF(CAT3_PROB2, 2); - ADJUST_COEF(CAT3_PROB1, 1); - ADJUST_COEF(CAT3_PROB0, 0); - WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY3); + ADJUST_COEF(vp9_cat3_prob[0], 2); + ADJUST_COEF(vp9_cat3_prob[1], 1); + ADJUST_COEF(vp9_cat3_prob[2], 0); + WRITE_COEF_CONTINUE(val, CATEGORY3_TOKEN); } val = CAT4_MIN_VAL; - ADJUST_COEF(CAT4_PROB3, 3); - ADJUST_COEF(CAT4_PROB2, 2); - ADJUST_COEF(CAT4_PROB1, 1); - ADJUST_COEF(CAT4_PROB0, 0); - WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4); + ADJUST_COEF(vp9_cat4_prob[0], 3); + ADJUST_COEF(vp9_cat4_prob[1], 2); + ADJUST_COEF(vp9_cat4_prob[2], 1); + ADJUST_COEF(vp9_cat4_prob[3], 0); + WRITE_COEF_CONTINUE(val, CATEGORY4_TOKEN); } - // CAT_FIVE_CONTEXT_NODE_0_: + if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) { val = CAT5_MIN_VAL; - ADJUST_COEF(CAT5_PROB4, 4); - ADJUST_COEF(CAT5_PROB3, 3); - ADJUST_COEF(CAT5_PROB2, 2); - ADJUST_COEF(CAT5_PROB1, 1); - ADJUST_COEF(CAT5_PROB0, 0); - WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5); + ADJUST_COEF(vp9_cat5_prob[0], 4); + ADJUST_COEF(vp9_cat5_prob[1], 3); + ADJUST_COEF(vp9_cat5_prob[2], 2); + ADJUST_COEF(vp9_cat5_prob[3], 1); + ADJUST_COEF(vp9_cat5_prob[4], 0); + WRITE_COEF_CONTINUE(val, CATEGORY5_TOKEN); } val = 0; - cat6 = cat6_prob; - while (*cat6) - val = (val << 1) | vp9_read(r, *cat6++); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[0]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[1]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[2]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[3]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[4]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[5]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[6]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[7]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[8]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[9]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[10]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[11]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[12]); + val = (val << 1) | vp9_read(r, vp9_cat6_prob[13]); val += CAT6_MIN_VAL; - WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6); - } - - if (c < seg_eob) { - if (!cm->frame_parallel_decoding_mode) - ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN]; + WRITE_COEF_CONTINUE(val, CATEGORY6_TOKEN); } return c; @@ -218,18 +177,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, - int x, int y, TX_SIZE tx_size, vp9_reader *r, - uint8_t *token_cache) { + int x, int y, TX_SIZE tx_size, vp9_reader *r) { struct macroblockd_plane *const pd = &xd->plane[plane]; - const int seg_eob = get_tx_eob(&cm->seg, 
xd->mi_8x8[0]->mbmi.segment_id, - tx_size); - const int pt = get_entropy_context(tx_size, pd->above_context + x, - pd->left_context + y); - const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob, + const int ctx = get_entropy_context(tx_size, pd->above_context + x, + pd->left_context + y); + const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block); + const int eob = decode_coefs(cm, xd, pd->plane_type, BLOCK_OFFSET(pd->dqcoeff, block), tx_size, - pd->dequant, pt, token_cache); - set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y); - pd->eobs[block] = eob; + pd->dequant, ctx, so->scan, so->neighbors, r); + vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y); return eob; } diff --git a/media/libvpx/vp9/decoder/vp9_detokenize.h b/media/libvpx/vp9/decoder/vp9_detokenize.h index e858a19f755d..5278e97a3028 100644 --- a/media/libvpx/vp9/decoder/vp9_detokenize.h +++ b/media/libvpx/vp9/decoder/vp9_detokenize.h @@ -12,12 +12,19 @@ #ifndef VP9_DECODER_VP9_DETOKENIZE_H_ #define VP9_DECODER_VP9_DETOKENIZE_H_ -#include "vp9/decoder/vp9_onyxd_int.h" -#include "vp9/decoder/vp9_dboolhuff.h" +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/decoder/vp9_reader.h" + +#ifdef __cplusplus +extern "C" { +#endif int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, - int x, int y, TX_SIZE tx_size, vp9_reader *r, - uint8_t *token_cache); + int x, int y, TX_SIZE tx_size, vp9_reader *r); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/media/libvpx/vp9/decoder/vp9_dsubexp.c b/media/libvpx/vp9/decoder/vp9_dsubexp.c index fcca01729a48..c22617edb16c 100644 --- a/media/libvpx/vp9/decoder/vp9_dsubexp.c +++ b/media/libvpx/vp9/decoder/vp9_dsubexp.c @@ -19,33 +19,13 @@ static int inv_recenter_nonneg(int v, int m) { return v % 2 ? m - (v + 1) / 2 : m + v / 2; } -static int decode_uniform(vp9_reader *r, int n) { - int v; - const int l = get_unsigned_bits(n); - const int m = (1 << l) - n; - if (!l) - return 0; - - v = vp9_read_literal(r, l - 1); +static int decode_uniform(vp9_reader *r) { + const int l = 8; + const int m = (1 << l) - 191; + const int v = vp9_read_literal(r, l - 1); return v < m ? v : (v << 1) - m + vp9_read_bit(r); } - -static int merge_index(int v, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (v < max1) { - v = v * modulus + modulus / 2; - } else { - int w; - v -= max1; - w = v; - v += (v + modulus - modulus / 2) / modulus; - while (v % modulus == modulus / 2 || - w != v - (v + modulus - modulus / 2) / modulus) v++; - } - return v; -} - static int inv_remap_prob(int v, int m) { static int inv_map_table[MAX_PROB - 1] = { 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, @@ -78,30 +58,19 @@ static int inv_remap_prob(int v, int m) { } } -static int decode_term_subexp(vp9_reader *r, int k, int num_syms) { - int i = 0, mk = 0, word; - while (1) { - const int b = i ? 
k + i - 1 : k; - const int a = 1 << b; - if (num_syms <= mk + 3 * a) { - word = decode_uniform(r, num_syms - mk) + mk; - break; - } else { - if (vp9_read_bit(r)) { - i++; - mk += a; - } else { - word = vp9_read_literal(r, b) + mk; - break; - } - } - } - return word; +static int decode_term_subexp(vp9_reader *r) { + if (!vp9_read_bit(r)) + return vp9_read_literal(r, 4); + if (!vp9_read_bit(r)) + return vp9_read_literal(r, 4) + 16; + if (!vp9_read_bit(r)) + return vp9_read_literal(r, 5) + 32; + return decode_uniform(r) + 64; } void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) { if (vp9_read(r, DIFF_UPDATE_PROB)) { - const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255); + const int delp = decode_term_subexp(r); *p = (vp9_prob)inv_remap_prob(delp, *p); } } diff --git a/media/libvpx/vp9/decoder/vp9_dsubexp.h b/media/libvpx/vp9/decoder/vp9_dsubexp.h index aeb9399d0278..436f434fb18b 100644 --- a/media/libvpx/vp9/decoder/vp9_dsubexp.h +++ b/media/libvpx/vp9/decoder/vp9_dsubexp.h @@ -12,8 +12,16 @@ #ifndef VP9_DECODER_VP9_DSUBEXP_H_ #define VP9_DECODER_VP9_DSUBEXP_H_ -#include "vp9/decoder/vp9_dboolhuff.h" +#include "vp9/decoder/vp9_reader.h" + +#ifdef __cplusplus +extern "C" { +#endif void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/media/libvpx/vp9/decoder/vp9_dthread.c b/media/libvpx/vp9/decoder/vp9_dthread.c new file mode 100644 index 000000000000..b82ea6a79f5a --- /dev/null +++ b/media/libvpx/vp9/decoder/vp9_dthread.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_reconinter.h" + +#include "vp9/decoder/vp9_dthread.h" +#include "vp9/decoder/vp9_decoder.h" + +#if CONFIG_MULTITHREAD +static INLINE void mutex_lock(pthread_mutex_t *const mutex) { + const int kMaxTryLocks = 4000; + int locked = 0; + int i; + + for (i = 0; i < kMaxTryLocks; ++i) { + if (!pthread_mutex_trylock(mutex)) { + locked = 1; + break; + } + } + + if (!locked) + pthread_mutex_lock(mutex); +} +#endif // CONFIG_MULTITHREAD + +static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; + mutex_lock(mutex); + + while (c > lf_sync->cur_sb_col[r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, + const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
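// Worked example, assuming nsync == 2 and sb_cols == 8: this row posts
// cur_sb_col = c at columns 1, 3 and 5, and sb_cols + nsync at the last
// column, while sync_read() above makes row r + 1 wait at even columns
// until cur_sb_col[r] >= c + nsync, keeping it at least nsync SBs behind.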
+  int sig = 1;
+
+  if (c < sb_cols - 1) {
+    cur = c;
+    if (c % nsync)
+      sig = 0;
+  } else {
+    cur = sb_cols + nsync;
+  }
+
+  if (sig) {
+    mutex_lock(&lf_sync->mutex_[r]);
+
+    lf_sync->cur_sb_col[r] = cur;
+
+    pthread_cond_signal(&lf_sync->cond_[r]);
+    pthread_mutex_unlock(&lf_sync->mutex_[r]);
+  }
+#else
+  (void)lf_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Implement row loopfiltering for each thread.
+static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
+                                VP9_COMMON *const cm,
+                                struct macroblockd_plane planes[MAX_MB_PLANE],
+                                int start, int stop, int y_only,
+                                VP9LfSync *const lf_sync, int num_lf_workers) {
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  int r, c;  // SB row and col
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+
+  for (r = start; r < stop; r += num_lf_workers) {
+    const int mi_row = r << MI_BLOCK_SIZE_LOG2;
+    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+    for (c = 0; c < sb_cols; ++c) {
+      const int mi_col = c << MI_BLOCK_SIZE_LOG2;
+      LOOP_FILTER_MASK lfm;
+      int plane;
+
+      sync_read(lf_sync, r, c);
+
+      vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+      vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      for (plane = 0; plane < num_planes; ++plane) {
+        vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
+      }
+
+      sync_write(lf_sync, r, c, sb_cols);
+    }
+  }
+}
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+  LFWorkerData *const lf_data = &tile_data->lfdata;
+  (void) arg2;
+  loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                      lf_data->start, lf_data->stop, lf_data->y_only,
+                      lf_data->lf_sync, lf_data->num_lf_workers);
+  return 1;
+}
+
+// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
+// threads.
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+                              VP9Decoder *pbi, VP9_COMMON *cm,
+                              int frame_filter_level,
+                              int y_only) {
+  VP9LfSync *const lf_sync = &pbi->lf_row_sync;
+  const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+  // Number of superblock rows and cols
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
+  int i;
+
+  // Allocate memory used in thread synchronization.
+  // This always needs to be done even if frame_filter_level is 0.
+  if (!lf_sync->sync_range || cm->last_height != cm->height) {
+    vp9_loop_filter_dealloc(lf_sync);
+    vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
+  }
+
+  if (!frame_filter_level) return;
+
+  vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+  // Set up loopfilter thread data.
+  // The decoder uses num_workers instead of pbi->num_tile_workers because
+  // it has been observed that using more threads for the loopfilter than
+  // there are tile columns in the frame hurts performance on Android: the
+  // system schedules the tile decode workers on only as many cores as
+  // there are tile columns, and extra loopfilter threads then just cause
+  // contention.
If the + // multithreading code changes in the future then the number of workers + // used by the loopfilter should be revisited. + for (i = 0; i < num_workers; ++i) { + VP9Worker *const worker = &pbi->tile_workers[i]; + TileWorkerData *const tile_data = (TileWorkerData*)worker->data1; + LFWorkerData *const lf_data = &tile_data->lfdata; + + worker->hook = (VP9WorkerHook)loop_filter_row_worker; + + // Loopfilter data + lf_data->frame_buffer = frame; + lf_data->cm = cm; + vp9_copy(lf_data->planes, pbi->mb.plane); + lf_data->start = i; + lf_data->stop = sb_rows; + lf_data->y_only = y_only; // always do all planes in decoder + + lf_data->lf_sync = lf_sync; + lf_data->num_lf_workers = num_workers; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } +} + +// Set up nsync by width. +static int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +// Allocate memory for lf row synchronization +void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows, + int width) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, lf_sync->mutex_, + vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[i], NULL); + } + + CHECK_MEM_ERROR(cm, lf_sync->cond_, + vpx_malloc(sizeof(*lf_sync->cond_) * rows)); + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[i], NULL); + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + + // Set up nsync. + lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { + if (lf_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (lf_sync->mutex_ != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[i]); + } + vpx_free(lf_sync->mutex_); + } + if (lf_sync->cond_ != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[i]); + } + vpx_free(lf_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->cur_sb_col); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + vp9_zero(*lf_sync); + } +} diff --git a/media/libvpx/vp9/decoder/vp9_dthread.h b/media/libvpx/vp9/decoder/vp9_dthread.h new file mode 100644 index 000000000000..8b02ef71ecf8 --- /dev/null +++ b/media/libvpx/vp9/decoder/vp9_dthread.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_DECODER_VP9_DTHREAD_H_ +#define VP9_DECODER_VP9_DTHREAD_H_ + +#include "./vpx_config.h" +#include "vp9/common/vp9_thread.h" +#include "vp9/decoder/vp9_reader.h" + +struct VP9Common; +struct VP9Decoder; + +typedef struct TileWorkerData { + struct VP9Common *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, struct macroblockd, xd); + + // Row-based parallel loopfilter data + LFWorkerData lfdata; +} TileWorkerData; + +// Loopfilter row synchronization +typedef struct VP9LfSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the loop-filtered superblock index in each row. + int *cur_sb_col; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; +} VP9LfSync; + +// Allocate memory for loopfilter row synchronization. +void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync, + int rows, int width); + +// Deallocate loopfilter synchronization related mutex and data. +void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); + +// Multi-threaded loopfilter that uses the tile threads. +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct VP9Decoder *pbi, + struct VP9Common *cm, + int frame_filter_level, + int y_only); + +#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/media/libvpx/vp9/decoder/vp9_onyxd.h b/media/libvpx/vp9/decoder/vp9_onyxd.h deleted file mode 100644 index a4b9c24fc9b2..000000000000 --- a/media/libvpx/vp9/decoder/vp9_onyxd.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_DECODER_VP9_ONYXD_H_ -#define VP9_DECODER_VP9_ONYXD_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -#include "vpx_scale/yv12config.h" -#include "vp9/common/vp9_ppflags.h" -#include "vpx/vpx_codec.h" - -typedef void *VP9D_PTR; - -typedef struct { - int width; - int height; - int version; - int postprocess; - int max_threads; - int inv_tile_order; - int input_partition; -} VP9D_CONFIG; - -typedef enum { - VP9_LAST_FLAG = 1, - VP9_GOLD_FLAG = 2, - VP9_ALT_FLAG = 4 -} VP9_REFFRAME; - -void vp9_initialize_dec(); - -int vp9_receive_compressed_data(VP9D_PTR comp, - size_t size, const uint8_t **dest, - int64_t time_stamp); - -int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, - vp9_ppflags_t *flags); - -vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp, - VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - -vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp, - VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - -int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); - - -VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf); - -void vp9_remove_decompressor(VP9D_PTR comp); - -#ifdef __cplusplus -} -#endif - -#endif // VP9_DECODER_VP9_ONYXD_H_ diff --git a/media/libvpx/vp9/decoder/vp9_onyxd_if.c b/media/libvpx/vp9/decoder/vp9_onyxd_if.c deleted file mode 100644 index cb45d3702e89..000000000000 --- a/media/libvpx/vp9/decoder/vp9_onyxd_if.c +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#include <assert.h>
-#include <limits.h>
-#include <stdio.h>
-
-#include "vp9/common/vp9_onyxc_int.h"
-#if CONFIG_VP9_POSTPROC
-#include "vp9/common/vp9_postproc.h"
-#endif
-#include "vp9/decoder/vp9_onyxd.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/common/vp9_loopfilter.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vpx_ports/vpx_timer.h"
-#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/decoder/vp9_detokenize.h"
-#include "./vpx_scale_rtcd.h"
-
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER == 1
-static void recon_write_yuv_frame(const char *name,
-                                  const YV12_BUFFER_CONFIG *s,
-                                  int w, int _h) {
-  FILE *yuv_file = fopen(name, "ab");
-  const uint8_t *src = s->y_buffer;
-  int h = _h;
-
-  do {
-    fwrite(src, w, 1, yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = (_h + 1) >> 1;
-  w = (w + 1) >> 1;
-
-  do {
-    fwrite(src, w, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = (_h + 1) >> 1;
-
-  do {
-    fwrite(src, w, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  fclose(yuv_file);
-}
-#endif
-#if WRITE_RECON_BUFFER == 2
-void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
-void vp9_initialize_dec() {
-  static int init_done = 0;
-
-  if (!init_done) {
-    vp9_initialize_common();
-    vp9_init_quant_tables();
-    init_done = 1;
-  }
-}
-
-static void init_macroblockd(VP9D_COMP *const pbi) {
-  MACROBLOCKD *xd = &pbi->mb;
-  struct macroblockd_plane *const pd = xd->plane;
-  int i;
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    pd[i].qcoeff = pbi->qcoeff[i];
-    pd[i].dqcoeff = pbi->dqcoeff[i];
-    pd[i].eobs = pbi->eobs[i];
-  }
-}
-
-VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
-  VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
-  VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
-
-  if (!cm)
-    return NULL;
-
-  vp9_zero(*pbi);
-
-  if (setjmp(cm->error.jmp)) {
-    cm->error.setjmp = 0;
-    vp9_remove_decompressor(pbi);
-    return NULL;
-  }
-
-  cm->error.setjmp = 1;
-  vp9_initialize_dec();
-
-  vp9_create_common(cm);
-
-  pbi->oxcf = *oxcf;
-  pbi->ready_for_new_data = 1;
-  cm->current_video_frame = 0;
-
-  // vp9_init_dequantizer() is first called here. Add check in
-  // frame_init_dequantizer() to avoid unnecessary calling of
-  // vp9_init_dequantizer() for every frame.
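// (vp9_init_dequantizer(), whose body appears in vp9_decodframe.c above,
// fills the per-qindex y/uv dequant tables from vp9_dc_quant() and
// vp9_ac_quant(); setup_quantization() then re-runs it only when one of
// the delta_q values actually changes.)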
- vp9_init_dequantizer(cm); - - vp9_loop_filter_init(cm); - - cm->error.setjmp = 0; - pbi->decoded_key_frame = 0; - - init_macroblockd(pbi); - - vp9_worker_init(&pbi->lf_worker); - - return pbi; -} - -void vp9_remove_decompressor(VP9D_PTR ptr) { - int i; - VP9D_COMP *const pbi = (VP9D_COMP *)ptr; - - if (!pbi) - return; - - vp9_remove_common(&pbi->common); - vp9_worker_end(&pbi->lf_worker); - vpx_free(pbi->lf_worker.data1); - for (i = 0; i < pbi->num_tile_workers; ++i) { - VP9Worker *const worker = &pbi->tile_workers[i]; - vp9_worker_end(worker); - vpx_free(worker->data1); - vpx_free(worker->data2); - } - vpx_free(pbi->tile_workers); - vpx_free(pbi->mi_streams); - vpx_free(pbi->above_context[0]); - vpx_free(pbi->above_seg_context); - vpx_free(pbi); -} - -static int equal_dimensions(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { - return a->y_height == b->y_height && a->y_width == b->y_width && - a->uv_height == b->uv_height && a->uv_width == b->uv_width; -} - -vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr, - VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd) { - VP9D_COMP *pbi = (VP9D_COMP *) ptr; - VP9_COMMON *cm = &pbi->common; - - /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the - * encoder is using the frame buffers for. This is just a stub to keep the - * vpxenc --test-decode functionality working, and will be replaced in a - * later commit that adds VP9-specific controls for this functionality. - */ - if (ref_frame_flag == VP9_LAST_FLAG) { - YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]]; - if (!equal_dimensions(cfg, sd)) - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Incorrect buffer dimensions"); - else - vp8_yv12_copy_frame(cfg, sd); - } else { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Invalid reference frame"); - } - - return cm->error.error_code; -} - - -vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd) { - VP9D_COMP *pbi = (VP9D_COMP *) ptr; - VP9_COMMON *cm = &pbi->common; - int *ref_fb_ptr = NULL; - - /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the - * encoder is using the frame buffers for. This is just a stub to keep the - * vpxenc --test-decode functionality working, and will be replaced in a - * later commit that adds VP9-specific controls for this functionality. - */ - if (ref_frame_flag == VP9_LAST_FLAG) { - ref_fb_ptr = &pbi->common.active_ref_idx[0]; - } else if (ref_frame_flag == VP9_GOLD_FLAG) { - ref_fb_ptr = &pbi->common.active_ref_idx[1]; - } else if (ref_frame_flag == VP9_ALT_FLAG) { - ref_fb_ptr = &pbi->common.active_ref_idx[2]; - } else { - vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, - "Invalid reference frame"); - return pbi->common.error.error_code; - } - - if (!equal_dimensions(&cm->yv12_fb[*ref_fb_ptr], sd)) { - vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR, - "Incorrect buffer dimensions"); - } else { - // Find an empty frame buffer. - const int free_fb = get_free_fb(cm); - // Decrease fb_idx_ref_cnt since it will be increased again in - // ref_cnt_fb() below. - cm->fb_idx_ref_cnt[free_fb]--; - - // Manage the reference counters and copy image. 
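// (As the comment above notes, ref_cnt_fb() will increment
// fb_idx_ref_cnt[free_fb] again when it repoints *ref_fb_ptr at free_fb,
// so the count is pre-decremented here to keep the totals balanced.)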
- ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb); - vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]); - } - - return pbi->common.error.error_code; -} - - -int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) { - VP9D_COMP *pbi = (VP9D_COMP *) ptr; - VP9_COMMON *cm = &pbi->common; - - if (index < 0 || index >= NUM_REF_FRAMES) - return -1; - - *fb = &cm->yv12_fb[cm->ref_frame_map[index]]; - return 0; -} - -/* If any buffer updating is signaled it should be done here. */ -static void swap_frame_buffers(VP9D_COMP *pbi) { - int ref_index = 0, mask; - VP9_COMMON *const cm = &pbi->common; - - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - if (mask & 1) - ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->ref_frame_map[ref_index], - cm->new_fb_idx); - ++ref_index; - } - - cm->frame_to_show = get_frame_new_buffer(cm); - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - - // Invalidate these references until the next frame starts. - for (ref_index = 0; ref_index < 3; ref_index++) - cm->active_ref_idx[ref_index] = INT_MAX; -} - -int vp9_receive_compressed_data(VP9D_PTR ptr, - size_t size, const uint8_t **psource, - int64_t time_stamp) { - VP9D_COMP *pbi = (VP9D_COMP *) ptr; - VP9_COMMON *cm = &pbi->common; - const uint8_t *source = *psource; - int retcode = 0; - - /*if(pbi->ready_for_new_data == 0) - return -1;*/ - - if (ptr == 0) - return -1; - - cm->error.error_code = VPX_CODEC_OK; - - pbi->source = source; - pbi->source_sz = size; - - if (pbi->source_sz == 0) { - /* This is used to signal that we are missing frames. - * We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - * - * TODO(jkoleszar): Error concealment is undefined and non-normative - * at this point, but if it becomes so, [0] may not always be the correct - * thing to do here. - */ - if (cm->active_ref_idx[0] != INT_MAX) - get_frame_ref_buffer(cm, 0)->corrupted = 1; - } - - cm->new_fb_idx = get_free_fb(cm); - - if (setjmp(cm->error.jmp)) { - cm->error.setjmp = 0; - - /* We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - * - * TODO(jkoleszar): Error concealment is undefined and non-normative - * at this point, but if it becomes so, [0] may not always be the correct - * thing to do here. 
- */ - if (cm->active_ref_idx[0] != INT_MAX) - get_frame_ref_buffer(cm, 0)->corrupted = 1; - - if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - - return -1; - } - - cm->error.setjmp = 1; - - retcode = vp9_decode_frame(pbi, psource); - - if (retcode < 0) { - cm->error.error_code = VPX_CODEC_ERROR; - cm->error.setjmp = 0; - if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - return retcode; - } - - swap_frame_buffers(pbi); - -#if WRITE_RECON_BUFFER == 2 - if (cm->show_frame) - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame); - else - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 1000); -#endif - - if (!pbi->do_loopfilter_inline) { - vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0); - } - -#if WRITE_RECON_BUFFER == 2 - if (cm->show_frame) - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 2000); - else - write_dx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 3000); -#endif - - vp9_extend_frame_inner_borders(cm->frame_to_show, - cm->subsampling_x, - cm->subsampling_y); - -#if WRITE_RECON_BUFFER == 1 - if (cm->show_frame) - recon_write_yuv_frame("recon.yuv", cm->frame_to_show, - cm->width, cm->height); -#endif - - vp9_clear_system_state(); - - cm->last_show_frame = cm->show_frame; - if (cm->show_frame) { - // current mip will be the prev_mip for the next frame - MODE_INFO *temp = cm->prev_mip; - MODE_INFO **temp2 = cm->prev_mi_grid_base; - cm->prev_mip = cm->mip; - cm->mip = temp; - cm->prev_mi_grid_base = cm->mi_grid_base; - cm->mi_grid_base = temp2; - - // update the upper left visible macroblock ptrs - cm->mi = cm->mip + cm->mode_info_stride + 1; - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; - cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; - - pbi->mb.mi_8x8 = cm->mi_grid_visible; - pbi->mb.mi_8x8[0] = cm->mi; - - cm->current_video_frame++; - } - - pbi->ready_for_new_data = 0; - pbi->last_time_stamp = time_stamp; - pbi->source_sz = 0; - - cm->error.setjmp = 0; - return retcode; -} - -int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd, - int64_t *time_stamp, int64_t *time_end_stamp, - vp9_ppflags_t *flags) { - int ret = -1; - VP9D_COMP *pbi = (VP9D_COMP *) ptr; - - if (pbi->ready_for_new_data == 1) - return ret; - - /* i.e., no raw frame to show */ - if (pbi->common.show_frame == 0) - return ret; - - pbi->ready_for_new_data = 1; - *time_stamp = pbi->last_time_stamp; - *time_end_stamp = 0; - -#if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(&pbi->common, sd, flags); -#else - - if (pbi->common.frame_to_show) { - *sd = *pbi->common.frame_to_show; - sd->y_width = pbi->common.width; - sd->y_height = pbi->common.height; - sd->uv_width = sd->y_width >> pbi->common.subsampling_x; - sd->uv_height = sd->y_height >> pbi->common.subsampling_y; - - ret = 0; - } else { - ret = -1; - } - -#endif /* !CONFIG_VP9_POSTPROC */ - vp9_clear_system_state(); - return ret; -} diff --git a/media/libvpx/vp9/decoder/vp9_onyxd_int.h b/media/libvpx/vp9/decoder/vp9_onyxd_int.h deleted file mode 100644 index d3d29e98dedc..000000000000 --- a/media/libvpx/vp9/decoder/vp9_onyxd_int.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree.
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_DECODER_VP9_ONYXD_INT_H_ -#define VP9_DECODER_VP9_ONYXD_INT_H_ - -#include "./vpx_config.h" - -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/decoder/vp9_onyxd.h" -#include "vp9/decoder/vp9_thread.h" - -typedef struct VP9Decompressor { - DECLARE_ALIGNED(16, MACROBLOCKD, mb); - - DECLARE_ALIGNED(16, VP9_COMMON, common); - - DECLARE_ALIGNED(16, int16_t, qcoeff[MAX_MB_PLANE][64 * 64]); - DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]); - DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]); - - VP9D_CONFIG oxcf; - - const uint8_t *source; - size_t source_sz; - - int64_t last_time_stamp; - int ready_for_new_data; - - int refresh_frame_flags; - - int decoded_key_frame; - - int initial_width; - int initial_height; - - int do_loopfilter_inline; // apply loopfilter to available rows immediately - VP9Worker lf_worker; - - VP9Worker *tile_workers; - int num_tile_workers; - - /* Each tile column has its own MODE_INFO stream. This array indexes them by - tile column index. */ - MODE_INFO **mi_streams; - - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - PARTITION_CONTEXT *above_seg_context; - - DECLARE_ALIGNED(16, uint8_t, token_cache[1024]); -} VP9D_COMP; - -#endif // VP9_DECODER_VP9_ONYXD_INT_H_ diff --git a/media/libvpx/vp9/decoder/vp9_read_bit_buffer.c b/media/libvpx/vp9/decoder/vp9_read_bit_buffer.c new file mode 100644 index 000000000000..3eef72844c10 --- /dev/null +++ b/media/libvpx/vp9/decoder/vp9_read_bit_buffer.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/decoder/vp9_read_bit_buffer.h" + +size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) { + return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT; +} + +int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) { + const size_t off = rb->bit_offset; + const size_t p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT; + if (rb->bit_buffer + p >= rb->bit_buffer_end) { + rb->error_handler(rb->error_handler_data); + return 0; + } else { + const int bit = (rb->bit_buffer[p] & (1 << q)) >> q; + rb->bit_offset = off + 1; + return bit; + } +} + +int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) { + int value = 0, bit; + for (bit = bits - 1; bit >= 0; bit--) + value |= vp9_rb_read_bit(rb) << bit; + return value; +} + +int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, + int bits) { + const int value = vp9_rb_read_literal(rb, bits); + return vp9_rb_read_bit(rb) ? 
-value : value; +} diff --git a/media/libvpx/vp9/decoder/vp9_read_bit_buffer.h b/media/libvpx/vp9/decoder/vp9_read_bit_buffer.h index 41a686837242..fc88bd70a15c 100644 --- a/media/libvpx/vp9/decoder/vp9_read_bit_buffer.h +++ b/media/libvpx/vp9/decoder/vp9_read_bit_buffer.h @@ -15,7 +15,11 @@ #include "vpx/vpx_integer.h" -typedef void (*vp9_rb_error_handler)(void *data, size_t bit_offset); +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*vp9_rb_error_handler)(void *data); struct vp9_read_bit_buffer { const uint8_t *bit_buffer; @@ -26,35 +30,16 @@ struct vp9_read_bit_buffer { vp9_rb_error_handler error_handler; }; -static size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) { - return rb->bit_offset / CHAR_BIT + (rb->bit_offset % CHAR_BIT > 0); -} +size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb); -static int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) { - const size_t off = rb->bit_offset; - const size_t p = off / CHAR_BIT; - const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT; - if (rb->bit_buffer + p >= rb->bit_buffer_end) { - rb->error_handler(rb->error_handler_data, rb->bit_offset); - return 0; - } else { - const int bit = (rb->bit_buffer[p] & (1 << q)) >> q; - rb->bit_offset = off + 1; - return bit; - } -} +int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb); -static int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) { - int value = 0, bit; - for (bit = bits - 1; bit >= 0; bit--) - value |= vp9_rb_read_bit(rb) << bit; - return value; -} +int vp9_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits); -static int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, - int bits) { - const int value = vp9_rb_read_literal(rb, bits); - return vp9_rb_read_bit(rb) ? -value : value; -} +int vp9_rb_read_signed_literal(struct vp9_read_bit_buffer *rb, int bits); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_DECODER_VP9_READ_BIT_BUFFER_H_ diff --git a/media/libvpx/vp9/decoder/vp9_dboolhuff.c b/media/libvpx/vp9/decoder/vp9_reader.c similarity index 62% rename from media/libvpx/vp9/decoder/vp9_dboolhuff.c rename to media/libvpx/vp9/decoder/vp9_reader.c index 06acec4dbf54..6bb4f9f732bb 100644 --- a/media/libvpx/vp9/decoder/vp9_dboolhuff.c +++ b/media/libvpx/vp9/decoder/vp9_reader.c @@ -11,40 +11,51 @@ #include "vpx_ports/mem.h" #include "vpx_mem/vpx_mem.h" -#include "vp9/decoder/vp9_dboolhuff.h" +#include "vp9/decoder/vp9_reader.h" // This is meant to be a large, positive constant that can still be efficiently // loaded as an immediate (on platforms like ARM, for example). // Even relatively modest values like 100 would work fine. 
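The raw bit buffer added above reads bits MSB-first, and a signed literal is a magnitude followed by a sign bit. A minimal usage sketch (the buffer contents and the on_error handler are illustrative, and the struct fields set here are the ones referenced in the code above):

#include <stdio.h>
#include "vp9/decoder/vp9_read_bit_buffer.h"

static void on_error(void *data) {
  (void)data;  /* e.g. longjmp to a recovery point */
}

int main(void) {
  const uint8_t buf[1] = { 0xB4 };  /* bits: 1011 0100 */
  struct vp9_read_bit_buffer rb = {0};
  rb.bit_buffer = buf;
  rb.bit_buffer_end = buf + 1;
  rb.bit_offset = 0;
  rb.error_handler_data = NULL;
  rb.error_handler = on_error;
  printf("%d\n", vp9_rb_read_literal(&rb, 4));         /* 11: bits 1,0,1,1 packed MSB-first */
  printf("%d\n", vp9_rb_read_signed_literal(&rb, 3));  /* 2: magnitude 010, then sign bit 0 */
  return 0;
}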
#define LOTS_OF_BITS 0x40000000 - -int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) { - int marker_bit; - - r->buffer_end = buffer + size; - r->buffer = buffer; - r->value = 0; - r->count = -8; - r->range = 255; - - if (size && !buffer) +int vp9_reader_init(vp9_reader *r, + const uint8_t *buffer, + size_t size, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + if (size && !buffer) { return 1; - - vp9_reader_fill(r); - marker_bit = vp9_read_bit(r); - return marker_bit != 0; + } else { + r->buffer_end = buffer + size; + r->buffer = buffer; + r->value = 0; + r->count = -8; + r->range = 255; + r->decrypt_cb = decrypt_cb; + r->decrypt_state = decrypt_state; + vp9_reader_fill(r); + return vp9_read_bit(r) != 0; // marker bit + } } void vp9_reader_fill(vp9_reader *r) { const uint8_t *const buffer_end = r->buffer_end; const uint8_t *buffer = r->buffer; - VP9_BD_VALUE value = r->value; + const uint8_t *buffer_start = buffer; + BD_VALUE value = r->value; int count = r->count; - int shift = BD_VALUE_SIZE - 8 - (count + 8); + int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); int loop_end = 0; - const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT); - const int x = shift + CHAR_BIT - bits_left; + const size_t bytes_left = buffer_end - buffer; + const size_t bits_left = bytes_left * CHAR_BIT; + const int x = (int)(shift + CHAR_BIT - bits_left); + + if (r->decrypt_cb) { + size_t n = MIN(sizeof(r->clear_buffer), bytes_left); + r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n); + buffer = r->clear_buffer; + buffer_start = r->clear_buffer; + } if (x >= 0) { count += LOTS_OF_BITS; @@ -54,12 +65,15 @@ void vp9_reader_fill(vp9_reader *r) { if (x < 0 || bits_left) { while (shift >= loop_end) { count += CHAR_BIT; - value |= (VP9_BD_VALUE)*buffer++ << shift; + value |= (BD_VALUE)*buffer++ << shift; shift -= CHAR_BIT; } } - r->buffer = buffer; + // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption, + // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than + // assign 'buffer' to 'r->buffer'. + r->buffer += buffer - buffer_start; r->value = value; r->count = count; } diff --git a/media/libvpx/vp9/decoder/vp9_reader.h b/media/libvpx/vp9/decoder/vp9_reader.h new file mode 100644 index 000000000000..2d9eccfbf93e --- /dev/null +++ b/media/libvpx/vp9/decoder/vp9_reader.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
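vp9_reader_fill() above decrypts at most sizeof(r->clear_buffer) bytes per refill through the optional callback. Judging from the call site, the callback receives (state, input, output, count); a hypothetical XOR "cipher" illustrates the contract (illustration only, not a real cipher):

/* Hypothetical vpx_decrypt_cb: recover 'count' clear bytes into 'output'. */
static void xor_decrypt(void *decrypt_state, const unsigned char *input,
                        unsigned char *output, int count) {
  const unsigned char key = *(const unsigned char *)decrypt_state;
  int i;
  for (i = 0; i < count; ++i)
    output[i] = input[i] ^ key;
}
/* Usage: vp9_reader_init(&r, data, size, xor_decrypt, &key); */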
+ */ + +#ifndef VP9_DECODER_VP9_READER_H_ +#define VP9_DECODER_VP9_READER_H_ + +#include <stddef.h> +#include <limits.h> + +#include "./vpx_config.h" +#include "vpx_ports/mem.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef size_t BD_VALUE; + +#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT) + +typedef struct { + const uint8_t *buffer_end; + const uint8_t *buffer; + uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; + BD_VALUE value; + int count; + unsigned int range; + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; +} vp9_reader; + +int vp9_reader_init(vp9_reader *r, + const uint8_t *buffer, + size_t size, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state); + +void vp9_reader_fill(vp9_reader *r); + +int vp9_reader_has_error(vp9_reader *r); + +const uint8_t *vp9_reader_find_end(vp9_reader *r); + +static INLINE int vp9_read(vp9_reader *r, int prob) { + unsigned int bit = 0; + BD_VALUE value; + BD_VALUE bigsplit; + int count; + unsigned int range; + unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; + + if (r->count < 0) + vp9_reader_fill(r); + + value = r->value; + count = r->count; + + bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); + + range = split; + + if (value >= bigsplit) { + range = r->range - split; + value = value - bigsplit; + bit = 1; + } + + { + register unsigned int shift = vp9_norm[range]; + range <<= shift; + value <<= shift; + count -= shift; + } + r->value = value; + r->count = count; + r->range = range; + + return bit; +} + +static INLINE int vp9_read_bit(vp9_reader *r) { + return vp9_read(r, 128); // vp9_prob_half +} + +static INLINE int vp9_read_literal(vp9_reader *r, int bits) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) + literal |= vp9_read_bit(r) << bit; + + return literal; +} + +static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, + const vp9_prob *probs) { + vp9_tree_index i = 0; + + while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0) + continue; + + return -i; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_DECODER_VP9_READER_H_ diff --git a/media/libvpx/vp9/decoder/vp9_thread.c b/media/libvpx/vp9/decoder/vp9_thread.c deleted file mode 100644 index d953e72b333a..000000000000 --- a/media/libvpx/vp9/decoder/vp9_thread.c +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree.
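To make the arithmetic in vp9_read() above concrete, here is one decode step starting from the state set by vp9_reader_init() (range = 255) with a mid-scale probability; the numbers follow directly from the expressions in the function:

#include <assert.h>

static void vp9_read_worked_example(void) {
  const unsigned int range = 255, prob = 128;  /* prob/256 is roughly 0.5 for a zero bit */
  const unsigned int split = (range * prob + (256 - prob)) >> 8;  /* >> CHAR_BIT */
  assert(split == 128);
  /* bit == 0 leaves range = split = 128, which needs no renormalization;
   * bit == 1 leaves range = 255 - 128 = 127, and vp9_norm[127] == 1 shifts
   * range back into [128, 255] while consuming one bit of 'value'. */
}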
-// ----------------------------------------------------------------------------- -// -// Multi-threaded worker -// -// Original source: -// http://git.chromium.org/webm/libwebp.git -// 100644 blob eff8f2a8c20095aade3c292b0e9292dac6cb3587 src/utils/thread.c - - -#include <assert.h> -#include <string.h> // for memset() -#include "./vp9_thread.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#if CONFIG_MULTITHREAD - -#if defined(_WIN32) - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -#include <windows.h> // NOLINT - -// _beginthreadex requires __stdcall -#define THREADFN unsigned int __stdcall -#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) - -static int pthread_create(pthread_t* const thread, const void* attr, - unsigned int (__stdcall *start)(void*), void* arg) { - (void)attr; - *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ - 0, /* unsigned stack_size */ - start, - arg, - 0, /* unsigned initflag */ - NULL); /* unsigned *thrdaddr */ - if (*thread == NULL) return 1; - SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); - return 0; -} - -static int pthread_join(pthread_t thread, void** value_ptr) { - (void)value_ptr; - return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || - CloseHandle(thread) == 0); -} - -// Mutex -static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) { - (void)mutexattr; - InitializeCriticalSection(mutex); - return 0; -} - -static int pthread_mutex_lock(pthread_mutex_t* const mutex) { - EnterCriticalSection(mutex); - return 0; -} - -static int pthread_mutex_unlock(pthread_mutex_t* const mutex) { - LeaveCriticalSection(mutex); - return 0; -} - -static int pthread_mutex_destroy(pthread_mutex_t* const mutex) { - DeleteCriticalSection(mutex); - return 0; -} - -// Condition -static int pthread_cond_destroy(pthread_cond_t* const condition) { - int ok = 1; - ok &= (CloseHandle(condition->waiting_sem_) != 0); - ok &= (CloseHandle(condition->received_sem_) != 0); - ok &= (CloseHandle(condition->signal_event_) != 0); - return !ok; - } - -static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) { - (void)cond_attr; - condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL); - condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL); - condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); - if (condition->waiting_sem_ == NULL || - condition->received_sem_ == NULL || - condition->signal_event_ == NULL) { - pthread_cond_destroy(condition); - return 1; - } - return 0; -} - -static int pthread_cond_signal(pthread_cond_t* const condition) { - int ok = 1; - if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { - // a thread is waiting in pthread_cond_wait: allow it to be notified - ok = SetEvent(condition->signal_event_); - // wait until the event is consumed so the signaler cannot consume - // the event via its own pthread_cond_wait.
- ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != - WAIT_OBJECT_0); - } - return !ok; -} - -static int pthread_cond_wait(pthread_cond_t* const condition, - pthread_mutex_t* const mutex) { - int ok; - // note that there is a consumer available so the signal isn't dropped in - // pthread_cond_signal - if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) - return 1; - // now unlock the mutex so pthread_cond_signal may be issued - pthread_mutex_unlock(mutex); - ok = (WaitForSingleObject(condition->signal_event_, INFINITE) == - WAIT_OBJECT_0); - ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL); - pthread_mutex_lock(mutex); - return !ok; -} - -#else // _WIN32 -# define THREADFN void* -# define THREAD_RETURN(val) val -#endif - -//------------------------------------------------------------------------------ - -static THREADFN thread_loop(void *ptr) { // thread loop - VP9Worker* const worker = (VP9Worker*)ptr; - int done = 0; - while (!done) { - pthread_mutex_lock(&worker->mutex_); - while (worker->status_ == OK) { // wait in idling mode - pthread_cond_wait(&worker->condition_, &worker->mutex_); - } - if (worker->status_ == WORK) { - vp9_worker_execute(worker); - worker->status_ = OK; - } else if (worker->status_ == NOT_OK) { // finish the worker - done = 1; - } - // signal to the main thread that we're done (for Sync()) - pthread_cond_signal(&worker->condition_); - pthread_mutex_unlock(&worker->mutex_); - } - return THREAD_RETURN(NULL); // Thread is finished -} - -// main thread state control -static void change_state(VP9Worker* const worker, - VP9WorkerStatus new_status) { - // no-op when attempting to change state on a thread that didn't come up - if (worker->status_ < OK) return; - - pthread_mutex_lock(&worker->mutex_); - // wait for the worker to finish - while (worker->status_ != OK) { - pthread_cond_wait(&worker->condition_, &worker->mutex_); - } - // assign new status and release the working thread if needed - if (new_status != OK) { - worker->status_ = new_status; - pthread_cond_signal(&worker->condition_); - } - pthread_mutex_unlock(&worker->mutex_); -} - -#endif // CONFIG_MULTITHREAD - -//------------------------------------------------------------------------------ - -void vp9_worker_init(VP9Worker* const worker) { - memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; -} - -int vp9_worker_sync(VP9Worker* const worker) { -#if CONFIG_MULTITHREAD - change_state(worker, OK); -#endif - assert(worker->status_ <= OK); - return !worker->had_error; -} - -int vp9_worker_reset(VP9Worker* const worker) { - int ok = 1; - worker->had_error = 0; - if (worker->status_ < OK) { -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&worker->mutex_, NULL) || - pthread_cond_init(&worker->condition_, NULL)) { - return 0; - } - pthread_mutex_lock(&worker->mutex_); - ok = !pthread_create(&worker->thread_, NULL, thread_loop, worker); - if (ok) worker->status_ = OK; - pthread_mutex_unlock(&worker->mutex_); -#else - worker->status_ = OK; -#endif - } else if (worker->status_ > OK) { - ok = vp9_worker_sync(worker); - } - assert(!ok || (worker->status_ == OK)); - return ok; -} - -void vp9_worker_execute(VP9Worker* const worker) { - if (worker->hook != NULL) { - worker->had_error |= !worker->hook(worker->data1, worker->data2); - } -} - -void vp9_worker_launch(VP9Worker* const worker) { -#if CONFIG_MULTITHREAD - change_state(worker, WORK); -#else - vp9_worker_execute(worker); -#endif -} - -void vp9_worker_end(VP9Worker* const worker) { - if (worker->status_ >= OK) { -#if 
CONFIG_MULTITHREAD - change_state(worker, NOT_OK); - pthread_join(worker->thread_, NULL); - pthread_mutex_destroy(&worker->mutex_); - pthread_cond_destroy(&worker->condition_); -#else - worker->status_ = NOT_OK; -#endif - } - assert(worker->status_ == NOT_OK); -} - -//------------------------------------------------------------------------------ - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif diff --git a/media/libvpx/vp9/decoder/vp9_thread.h b/media/libvpx/vp9/decoder/vp9_thread.h deleted file mode 100644 index a624f3c2ab78..000000000000 --- a/media/libvpx/vp9/decoder/vp9_thread.h +++ /dev/null @@ -1,98 +0,0 @@ -// Copyright 2013 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// Multi-threaded worker -// -// Original source: -// http://git.chromium.org/webm/libwebp.git -// 100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d src/utils/thread.h - - -#ifndef VP9_DECODER_VP9_THREAD_H_ -#define VP9_DECODER_VP9_THREAD_H_ - -#include "./vpx_config.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -#if CONFIG_MULTITHREAD - -#if defined(_WIN32) - -#include <windows.h> // NOLINT -typedef HANDLE pthread_t; -typedef CRITICAL_SECTION pthread_mutex_t; -typedef struct { - HANDLE waiting_sem_; - HANDLE received_sem_; - HANDLE signal_event_; -} pthread_cond_t; - -#else - -#include <pthread.h> // NOLINT - -#endif /* _WIN32 */ -#endif /* CONFIG_MULTITHREAD */ - -// State of the worker thread object -typedef enum { - NOT_OK = 0, // object is unusable - OK, // ready to work - WORK // busy finishing the current task -} VP9WorkerStatus; - -// Function to be called by the worker thread. Takes two opaque pointers as -// arguments (data1 and data2), and should return false in case of error. -typedef int (*VP9WorkerHook)(void*, void*); - -// Synchronize object used to launch job in the worker thread -typedef struct { -#if CONFIG_MULTITHREAD - pthread_mutex_t mutex_; - pthread_cond_t condition_; - pthread_t thread_; -#endif - VP9WorkerStatus status_; - VP9WorkerHook hook; // hook to call - void* data1; // first argument passed to 'hook' - void* data2; // second argument passed to 'hook' - int had_error; // return value of the last call to 'hook' -} VP9Worker; - -// Must be called first, before any other method. -void vp9_worker_init(VP9Worker* const worker); -// Must be called to initialize the object and spawn the thread. Re-entrant. -// Will potentially launch the thread. Returns false in case of error. -int vp9_worker_reset(VP9Worker* const worker); -// Makes sure the previous work is finished. Returns true if worker->had_error -// was not set and no error condition was triggered by the working thread. -int vp9_worker_sync(VP9Worker* const worker); -// Triggers the thread to call hook() with the data1 and data2 arguments. These -// hook/data1/data2 can be changed at any time before calling this function, -// but not be changed afterward until the next call to vp9_worker_sync(). -void vp9_worker_launch(VP9Worker* const worker); -// This function is similar to vp9_worker_launch() except that it calls the -// hook directly instead of using a thread.
Convenient to bypass the thread -// mechanism while still using the VP9Worker structs. vp9_worker_sync() must -// still be called afterward (for error reporting). -void vp9_worker_execute(VP9Worker* const worker); -// Kill the thread and terminate the object. To use the object again, one -// must call vp9_worker_reset() again. -void vp9_worker_end(VP9Worker* const worker); - -//------------------------------------------------------------------------------ - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif - -#endif // VP9_DECODER_VP9_THREAD_H_ diff --git a/media/libvpx/vp9/decoder/vp9_treereader.h b/media/libvpx/vp9/decoder/vp9_treereader.h deleted file mode 100644 index 41680d245b2f..000000000000 --- a/media/libvpx/vp9/decoder/vp9_treereader.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_DECODER_VP9_TREEREADER_H_ -#define VP9_DECODER_VP9_TREEREADER_H_ - -#include "vp9/common/vp9_treecoder.h" -#include "vp9/decoder/vp9_dboolhuff.h" - -// Intent of tree data structure is to make decoding trivial. -static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */ - vp9_tree t, - const vp9_prob *const p) { - register vp9_tree_index i = 0; - - while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0) - continue; - - return -i; -} - -#endif // VP9_DECODER_VP9_TREEREADER_H_ diff --git a/media/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/media/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 000000000000..6c66f5d5bc98 --- /dev/null +++ b/media/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
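The deleted header documents a strict call order for the worker object. A minimal lifecycle sketch following those comments (the hook and its data are placeholders, not part of the patch):

static int my_hook(void *data1, void *data2) {
  (void)data1;
  (void)data2;
  return 1;  /* non-zero means success; failures are recorded in had_error */
}

static void worker_example(void) {
  VP9Worker worker;
  vp9_worker_init(&worker);            /* must be called first */
  if (!vp9_worker_reset(&worker))      /* spawns the thread when multithreaded */
    return;
  worker.hook = my_hook;
  worker.data1 = NULL;
  worker.data2 = NULL;
  vp9_worker_launch(&worker);          /* runs hook(data1, data2) asynchronously */
  if (!vp9_worker_sync(&worker)) {
    /* hook reported an error */
  }
  vp9_worker_end(&worker);             /* joins the thread; reset before reuse */
}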
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} + +void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const
int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), + vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), + vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = 
vtrnq_s32(vreinterpretq_s32_s16(out_4), + vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), + vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from vp9_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + diff --git a/media/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/media/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c new file mode 100644 index 000000000000..8c13d0da672c --- /dev/null +++ b/media/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
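The post-conditioning at the end of vp9_fdct8x8_neon() above halves with shifts because vhsubq_s16 computes (a - b) >> 1; subtracting the sign bit first makes the shift round toward zero like C integer division. A standalone scalar check of the identity (for illustration; assumes arithmetic right shift, as the NEON code does):

#include <assert.h>

static void halve_check(void) {
  const short n = -3;
  const short sign = n >> 15;              /* arithmetic shift: -1 for negative n */
  assert(((short)(n - sign) >> 1) == -1);  /* (-3 - (-1)) >> 1 == -3 / 2 */
  assert((3 >> 1) == 3 / 2);               /* positive n needs no correction */
}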
+ */ + +#include <arm_neon.h> + +#include <math.h> + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)scan; + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(&qcoeff_ptr[0], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff =
vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(&qcoeff_ptr[i], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); + } + { + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } + } else { + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + *eob_ptr = 0; + } +} diff --git a/media/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c b/media/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c new file mode 100644 index 000000000000..c4cd856804da --- /dev/null +++ b/media/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), + vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), + vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + vec_accum_lo =
vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), + vget_low_u8(vec_ref)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); +} diff --git a/media/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c b/media/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c new file mode 100644 index 000000000000..b4bf567db99c --- /dev/null +++ b/media/libvpx/vp9/encoder/arm/neon/vp9_subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
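As a cross-check for the NEON SAD kernels above, the quantity they compute is, in scalar form, roughly the following (a reference sketch under stated assumptions, not the actual vp9 C code):

#include <stdlib.h>
#include "vpx/vpx_integer.h"

static unsigned int sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      sad += abs(src[c] - ref[c]);  /* sum of absolute differences */
    src += src_stride;
    ref += ref_stride;
  }
  return sad;  /* vp9_sad8x8_neon corresponds to width = height = 8 */
}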
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +void vp9_subtract_block_neon(int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), + vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), + vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), + vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), + vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), + vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), + vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) + diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/media/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c b/media/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c new file mode 100644 index 000000000000..816fbda1fbe3 --- /dev/null +++ b/media/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +enum { kWidth8 = 8 }; +enum { kHeight8 = 8 }; +enum { kHeight8PlusOne = 9 }; +enum { kWidth16 = 16 }; +enum { kHeight16 = 16 }; +enum { kHeight16PlusOne = 17 }; +enum { kWidth32 = 32 }; +enum { kHeight32 = 32 }; +enum { kHeight32PlusOne = 33 }; +enum { kPixelStepOne = 1 }; +enum { kAlign16 = 16 }; + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8, + kHeight8, sse, sum); +} + +unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); +} + +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, + kHeight16, sse, sum); +} + +unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a =
vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, + kWidth8, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); +} + +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight16PlusOne, kWidth16, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16, + kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); +} + +void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, + kHeight32, sse, sum); +} + +unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); +} + +unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 
kPixelStepOne,
+                             kHeight32PlusOne, kWidth32,
+                             BILINEAR_FILTERS_2TAP(xoffset));
+  var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
+                             kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
+  return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
+}
diff --git a/media/libvpx/vp9/encoder/vp9_aq_complexity.c b/media/libvpx/vp9/encoder/vp9_aq_complexity.c
new file mode 100644
index 000000000000..33f92393c941
--- /dev/null
+++ b/media/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_segmentation.h"
+
+#define AQ_C_SEGMENTS 3
+#define AQ_C_STRENGTHS 3
+static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+  {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+
+static int get_aq_c_strength(int q_index) {
+  // Approximate base quantizer (truncated to int).
+  int base_quant = vp9_ac_quant(q_index, 0) / 4;
+  return (base_quant > 20) + (base_quant > 45);
+}
+
+void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  struct segmentation *const seg = &cm->seg;
+
+  // Make SURE use of floating point in this function is safe.
+  vp9_clear_system_state();
+
+  if (cm->frame_type == KEY_FRAME ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    int segment;
+    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int active_segments = aq_c_active_segments[aq_strength];
+
+    // Clear down the segment map.
+    vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+    // Clear down the complexity map used for rd.
+    vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+
+    vp9_clearall_segfeatures(seg);
+
+    // Segmentation only makes sense if the target bits per SB is above a
+    // threshold. Below this the overheads will usually outweigh any benefit.
+    if (cpi->rc.sb64_target_rate < 256) {
+      vp9_disable_segmentation(seg);
+      return;
+    }
+
+    vp9_enable_segmentation(seg);
+
+    // Select delta coding method.
+    seg->abs_delta = SEGMENT_DELTADATA;
+
+    // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
+    vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+
+    // Use some of the segments for in frame Q adjustment.
+    for (segment = 1; segment < active_segments; ++segment) {
+      int qindex_delta =
+          vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+                                     aq_c_q_adj_factor[aq_strength][segment]);
+
+      // For AQ complexity mode, we don't allow Q0 in a segment if the base
+      // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+      // Q delta is sometimes applied without going back around the rd loop.
+      // This could lead to an illegal combination of partition size and q.
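+      // For example, with a (hypothetical) base_qindex of 1 and a computed
+      // qindex_delta of -1, the sum 1 + (-1) == 0 would request lossless
+      // coding for the segment, so the clamp below raises the delta to
+      // -base_qindex + 1 == 0 and the segment stays at the base Q.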
+      if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -cm->base_qindex + 1;
+      }
+      if ((cm->base_qindex + qindex_delta) > 0) {
+        vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+        vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+      }
+    }
+  }
+}
+
+// Select a segment for the current SB64 block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average.
+// An "aq_strength" value determines how many segments are supported,
+// the set of transition points to use and the extent of the quantizer
+// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
+void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
+                                   int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate) {
+  VP9_COMMON *const cm = &cpi->common;
+
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+  const int xmis = MIN(cm->mi_cols - mi_col, bw);
+  const int ymis = MIN(cm->mi_rows - mi_row, bh);
+  int complexity_metric = 64;
+  int x, y;
+
+  unsigned char segment;
+
+  if (!output_enabled) {
+    segment = 0;
+  } else {
+    // Rate depends on the fraction of the SB64 in frame
+    // ((xmis * ymis) / (bw * bh)). It is converted to bits * 256 units.
+    const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
+                            (bw * bh);
+    const int aq_strength = get_aq_c_strength(cm->base_qindex);
+    const int active_segments = aq_c_active_segments[aq_strength];
+
+    // The number of segments considered and the transition points used to
+    // select them are determined by the "aq_strength" value.
+    // Currently this loop only supports segments that reduce Q (i.e. where
+    // there is undershoot).
+    // The loop counts down towards segment 0 which is the default segment
+    // with no Q adjustment.
+    segment = active_segments - 1;
+    while (segment > 0) {
+      if (projected_rate <
+          (target_rate * aq_c_transitions[aq_strength][segment])) {
+        break;
+      }
+      --segment;
+    }
+
+    if (target_rate > 0) {
+      complexity_metric =
+          clamp((int)((projected_rate * 64) / target_rate), 16, 255);
+    }
+  }
+
+  // Fill in the entries in the segment map corresponding to this SB64.
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+      cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
+          (unsigned char)complexity_metric;
+    }
+  }
+}
diff --git a/media/libvpx/vp9/encoder/vp9_aq_complexity.h b/media/libvpx/vp9/encoder/vp9_aq_complexity.h
new file mode 100644
index 000000000000..af031a46c6cd
--- /dev/null
+++ b/media/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+// Select a segment for the current SB64.
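+// (If output_enabled is 0 the block is assigned segment 0; otherwise the
+// segment is chosen by comparing projected_rate against scaled fractions
+// of the per-SB64 target rate.)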
+void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
+                                   int output_enabled, int projected_rate);
+
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
diff --git a/media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
new file mode 100644
index 000000000000..e7f0daac3ae6
--- /dev/null
+++ b/media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+struct CYCLIC_REFRESH {
+  // Percentage of super-blocks per frame that are targeted as candidates
+  // for cyclic refresh.
+  int max_sbs_perframe;
+  // Maximum q-delta as percentage of base q.
+  int max_qdelta_perc;
+  // Block size below which we don't apply cyclic refresh.
+  BLOCK_SIZE min_block_size;
+  // Superblock starting index for cycling through the frame.
+  int sb_index;
+  // Controls how long a block will need to wait to be refreshed again.
+  int time_for_refresh;
+  // Actual number of (8x8) blocks to which delta-q was applied (segment 1).
+  int num_seg_blocks;
+  // Actual encoding bits for segment 1.
+  int actual_seg_bits;
+  // RD mult. parameters for segment 1.
+  int rdmult;
+  // Cyclic refresh map.
+  signed char *map;
+  // Projected rate and distortion for the current superblock.
+  int64_t projected_rate_sb;
+  int64_t projected_dist_sb;
+  // Thresholds applied to projected rate/distortion of the superblock.
+  int64_t thresh_rate_sb;
+  int64_t thresh_dist_sb;
+};
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
+  if (cr == NULL)
+    return NULL;
+
+  cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+  if (cr->map == NULL) {
+    vpx_free(cr);
+    return NULL;
+  }
+
+  return cr;
+}
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  vpx_free(cr->map);
+  vpx_free(cr);
+}
+
+// Check if we should turn off cyclic refresh based on bitrate condition.
+static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
+                                        const RATE_CONTROL *rc) {
+  // Turn off cyclic refresh if bits available per frame is not sufficiently
+  // larger than bit cost of segmentation. Segment map bit cost should scale
+  // with number of seg blocks, so compare available bits to number of blocks.
+  // Average bits available per frame = avg_frame_bandwidth
+  // Number of (8x8) blocks in frame = mi_rows * mi_cols;
+  const float factor = 0.5;
+  const int number_blocks = cm->mi_rows * cm->mi_cols;
+  // The condition below corresponds to turning off at target bitrates:
+  // ~24kbps for CIF, 72kbps for VGA (at 30fps).
+  // Also turn off at very small frame sizes, to avoid refreshing too large
+  // a fraction of the superblocks per frame. Threshold below is less than
+  // QCIF.
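+  // For example, a CIF frame (352x288) has 44 * 36 = 1584 8x8 blocks, so
+  // the first condition below disables refresh under 0.5 * 1584 = 792 bits
+  // per frame, i.e. ~24kbps at 30fps; VGA (640x480) has 80 * 60 = 4800
+  // blocks, giving a cutoff near 72kbps. The second condition disables
+  // refresh when the frame holds fewer than five superblocks' worth of
+  // 8x8 blocks.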
+ if (rc->avg_frame_bandwidth < factor * number_blocks || + number_blocks / 64 < 5) + return 0; + else + return 1; +} + +// Check if this coding block, of size bsize, should be considered for refresh +// (lower-qp coding). Decision can be based on various factors, such as +// size of the coding block (i.e., below min_block size rejected), coding +// mode, and rate/distortion. +static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, + const MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int use_rd) { + if (use_rd) { + // If projected rate is below the thresh_rate (well below target, + // so undershoot expected), accept it for lower-qp coding. + if (cr->projected_rate_sb < cr->thresh_rate_sb) + return 1; + // Otherwise, reject the block for lower-qp coding if any of the following: + // 1) prediction block size is below min_block_size + // 2) mode is non-zero mv and projected distortion is above thresh_dist + // 3) mode is an intra-mode (we may want to allow some of this under + // another thresh_dist) + else if (bsize < cr->min_block_size || + (mbmi->mv[0].as_int != 0 && + cr->projected_dist_sb > cr->thresh_dist_sb) || + !is_inter_block(mbmi)) + return 0; + else + return 1; + } else { + // Rate/distortion not used for update. + if (bsize < cr->min_block_size || + mbmi->mv[0].as_int != 0 || + !is_inter_block(mbmi)) + return 0; + else + return 1; + } +} + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, + MB_MODE_INFO *const mbmi, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int use_rd) { + const VP9_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = MIN(cm->mi_cols - mi_col, bw); + const int ymis = MIN(cm->mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_cols + mi_col; + const int refresh_this_block = cpi->mb.in_static_area || + candidate_refresh_aq(cr, mbmi, bsize, use_rd); + // Default is to not update the refresh map. + int new_map_value = cr->map[block_index]; + int x = 0; int y = 0; + + // Check if we should reset the segment_id for this block. + if (mbmi->segment_id > 0 && !refresh_this_block) + mbmi->segment_id = 0; + + // Update the cyclic refresh map, to be used for setting segmentation map + // for the next frame. If the block will be refreshed this frame, mark it + // as clean. The magnitude of the -ve influences how long before we consider + // it for refresh again. + if (mbmi->segment_id == 1) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) + new_map_value = 0; + } else { + // Leave it marked as block that is not candidate for refresh. + new_map_value = 1; + } + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. 
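+  // The map entries behave as a small per-block state machine:
+  //   1  -> not currently a refresh candidate;
+  //   0  -> candidate, may be promoted to segment 1 when the cyclic scan
+  //         in vp9_cyclic_refresh_setup() reaches it;
+  //   <0 -> refreshed this frame, incremented back toward 0 on later
+  //         passes before it can become a candidate again.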
+ for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + cr->map[block_index + y * cm->mi_cols + x] = new_map_value; + cpi->segmentation_map[block_index + y * cm->mi_cols + x] = + mbmi->segment_id; + } + // Keep track of actual number (in units of 8x8) of blocks in segment 1 used + // for encoding this frame. + if (mbmi->segment_id) + cr->num_seg_blocks += xmis * ymis; +} + +// Setup cyclic background refresh: set delta q and segmentation map. +void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + struct segmentation *const seg = &cm->seg; + unsigned char *const seg_map = cpi->segmentation_map; + const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); + // Don't apply refresh on key frame or enhancement layer frames. + if (!apply_cyclic_refresh || + (cm->frame_type == KEY_FRAME) || + (cpi->svc.temporal_layer_id > 0)) { + // Set segmentation map to 0 and disable. + vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + vp9_disable_segmentation(&cm->seg); + if (cm->frame_type == KEY_FRAME) + cr->sb_index = 0; + return; + } else { + int qindex_delta = 0; + int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; + int xmis, ymis, x, y, qindex2; + + // Rate target ratio to set q delta. + const float rate_ratio_qdelta = 2.0; + const double q = vp9_convert_qindex_to_q(cm->base_qindex); + vp9_clear_system_state(); + // Some of these parameters may be set via codec-control function later. + cr->max_sbs_perframe = 10; + cr->max_qdelta_perc = 50; + cr->min_block_size = BLOCK_8X8; + cr->time_for_refresh = 1; + // Set rate threshold to some fraction of target (and scaled by 256). + cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2; + // Distortion threshold, quadratic in Q, scale factor to be adjusted. + cr->thresh_dist_sb = 8 * (int)(q * q); + if (cpi->sf.use_nonrd_pick_mode) { + // May want to be more conservative with thresholds in non-rd mode for now + // as rate/distortion are derived from model based on prediction residual. + cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3; + cr->thresh_dist_sb = 4 * (int)(q * q); + } + + cr->num_seg_blocks = 0; + // Set up segmentation. + // Clear down the segment map. + vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + vp9_enable_segmentation(&cm->seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + // Note: setting temporal_update has no effect, as the seg-map coding method + // (temporal or spatial) is determined in vp9_choose_segmap_coding_method(), + // based on the coding cost of each method. For error_resilient mode on the + // last_frame_seg_map is set to 0, so if temporal coding is used, it is + // relative to 0 previous map. + // seg->temporal_update = 0; + + // Segment 0 "Q" feature is disabled so it defaults to the baseline Q. + vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q); + // Use segment 1 for in-frame Q adjustment. + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + + // Set the q delta for segment 1. + qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type, + cm->base_qindex, + rate_ratio_qdelta); + // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from + // previous encoded frame. + if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100) + qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100; + + // Compute rd-mult for segment 1. 
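+    // (The q delta was capped above at max_qdelta_perc of the base qindex;
+    // e.g. with base_qindex 60 and max_qdelta_perc 50 it cannot go below
+    // -30.)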
+    qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+    cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
+
+    vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
+
+    sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+    sbs_in_frame = sb_cols * sb_rows;
+    // Number of target superblocks to get the q delta (segment 1).
+    block_count = cr->max_sbs_perframe * sbs_in_frame / 100;
+    // Set the segmentation map: cycle through the superblocks, starting at
+    // cr->sb_index, and stopping when either block_count blocks have been
+    // found to be refreshed, or we have passed through the whole frame.
+    assert(cr->sb_index < sbs_in_frame);
+    i = cr->sb_index;
+    do {
+      int sum_map = 0;
+      // Get the mi_row/mi_col corresponding to superblock index i.
+      int sb_row_index = (i / sb_cols);
+      int sb_col_index = i - sb_row_index * sb_cols;
+      int mi_row = sb_row_index * MI_BLOCK_SIZE;
+      int mi_col = sb_col_index * MI_BLOCK_SIZE;
+      assert(mi_row >= 0 && mi_row < cm->mi_rows);
+      assert(mi_col >= 0 && mi_col < cm->mi_cols);
+      bl_index = mi_row * cm->mi_cols + mi_col;
+      // Loop through all 8x8 blocks in superblock and update map.
+      xmis = MIN(cm->mi_cols - mi_col,
+                 num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+      ymis = MIN(cm->mi_rows - mi_row,
+                 num_8x8_blocks_high_lookup[BLOCK_64X64]);
+      for (y = 0; y < ymis; y++) {
+        for (x = 0; x < xmis; x++) {
+          const int bl_index2 = bl_index + y * cm->mi_cols + x;
+          // If the block is a candidate for clean up then mark it
+          // for possible boost/refresh (segment 1). The segment id may get
+          // reset to 0 later if the block gets coded as anything other than
+          // ZEROMV.
+          if (cr->map[bl_index2] == 0) {
+            seg_map[bl_index2] = 1;
+            sum_map++;
+          } else if (cr->map[bl_index2] < 0) {
+            cr->map[bl_index2]++;
+          }
+        }
+      }
+      // Enforce constant segment over superblock.
+      // If segment is partial over superblock, reset to either all 1 or 0.
+      if (sum_map > 0 && sum_map < xmis * ymis) {
+        const int new_value = (sum_map >= xmis * ymis / 2);
+        for (y = 0; y < ymis; y++)
+          for (x = 0; x < xmis; x++)
+            seg_map[bl_index + y * cm->mi_cols + x] = new_value;
+      }
+      i++;
+      if (i == sbs_in_frame) {
+        i = 0;
+      }
+      if (sum_map >= xmis * ymis / 2)
+        block_count--;
+    } while (block_count && i != cr->sb_index);
+    cr->sb_index = i;
+  }
+}
+
+void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
+                                             int64_t rate_sb, int64_t dist_sb) {
+  cr->projected_rate_sb = rate_sb;
+  cr->projected_dist_sb = dist_sb;
+}
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  return cr->rdmult;
+}
diff --git a/media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
new file mode 100644
index 000000000000..f556d658bdcc
--- /dev/null
+++ b/media/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ + +#include "vp9/common/vp9_blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP9_COMP; + +struct CYCLIC_REFRESH; +typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; + +CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols); + +void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr); + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi, + MB_MODE_INFO *const mbmi, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int use_rd); + +// Setup cyclic background refresh: set delta q and segmentation map. +void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi); + +void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr, + int64_t rate_sb, int64_t dist_sb); + +int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ diff --git a/media/libvpx/vp9/encoder/vp9_vaq.c b/media/libvpx/vp9/encoder/vp9_aq_variance.c similarity index 66% rename from media/libvpx/vp9/encoder/vp9_vaq.c rename to media/libvpx/vp9/encoder/vp9_aq_variance.c index 1f9cb8709c28..56db95eb78b6 100644 --- a/media/libvpx/vp9/encoder/vp9_vaq.c +++ b/media/libvpx/vp9/encoder/vp9_aq_variance.c @@ -10,17 +10,17 @@ #include -#include "vp9/encoder/vp9_vaq.h" +#include "vp9/encoder/vp9_aq_variance.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/common/vp9_systemdependent.h" -#define ENERGY_MIN (-3) -#define ENERGY_MAX (3) +#define ENERGY_MIN (-1) +#define ENERGY_MAX (1) #define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) #define ENERGY_IN_BOUNDS(energy)\ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) @@ -44,7 +44,7 @@ unsigned int vp9_vaq_segment_id(int energy) { double vp9_vaq_rdmult_ratio(int energy) { ENERGY_IN_BOUNDS(energy); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); return RDMULT_RATIO(energy); } @@ -52,7 +52,7 @@ double vp9_vaq_rdmult_ratio(int energy) { double vp9_vaq_inv_q_ratio(int energy) { ENERGY_IN_BOUNDS(energy); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); return Q_RATIO(-energy); } @@ -63,9 +63,9 @@ void vp9_vaq_init() { assert(ENERGY_SPAN <= MAX_SEGMENTS); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - base_ratio = 1.8; + base_ratio = 1.5; for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { Q_RATIO(i) = pow(base_ratio, i/3.0); @@ -75,35 +75,39 @@ void vp9_vaq_init() { void vp9_vaq_frame_setup(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; struct segmentation *seg = &cm->seg; - int base_q = vp9_convert_qindex_to_q(cm->base_qindex); - int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + - cm->y_dc_delta_q); + const double base_q = vp9_convert_qindex_to_q(cm->base_qindex); + const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + + cm->y_dc_delta_q); int i; - vp9_enable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(seg); + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); - seg->abs_delta = 
SEGMENT_DELTADATA; + seg->abs_delta = SEGMENT_DELTADATA; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { - int qindex_delta, segment_rdmult; + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + int qindex_delta, segment_rdmult; - if (Q_RATIO(i) == 1) { - // No need to enable SEG_LVL_ALT_Q for this segment - RDMULT_RATIO(i) = 1; - continue; + if (Q_RATIO(i) == 1) { + // No need to enable SEG_LVL_ALT_Q for this segment + RDMULT_RATIO(i) = 1; + continue; + } + + qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i)); + vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); + + segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + + cm->y_dc_delta_q); + + RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; } - - qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); - vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); - vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); - - segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + - cm->y_dc_delta_q); - - RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; } } @@ -137,11 +141,8 @@ int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { double energy; unsigned int var = block_variance(cpi, x, bs); - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); - // if (var <= 1000) - // return 0; - - energy = 0.9*(logf(var + 1) - 10.0); - return clamp(round(energy), ENERGY_MIN, ENERGY_MAX); + energy = 0.9 * (log(var + 1.0) - 10.0); + return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX); } diff --git a/media/libvpx/vp9/encoder/vp9_vaq.h b/media/libvpx/vp9/encoder/vp9_aq_variance.h similarity index 73% rename from media/libvpx/vp9/encoder/vp9_vaq.h rename to media/libvpx/vp9/encoder/vp9_aq_variance.h index dc18b22f251b..d1a459fe9ec5 100644 --- a/media/libvpx/vp9/encoder/vp9_vaq.h +++ b/media/libvpx/vp9/encoder/vp9_aq_variance.h @@ -9,10 +9,14 @@ */ -#ifndef VP9_ENCODER_VP9_CONFIG_VAQ_H_ -#define VP9_ENCODER_VP9_CONFIG_VAQ_H_ +#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#define VP9_ENCODER_VP9_AQ_VARIANCE_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif unsigned int vp9_vaq_segment_id(int energy); double vp9_vaq_rdmult_ratio(int energy); @@ -23,4 +27,8 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi); int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); -#endif // VP9_ENCODER_VP9_CONFIG_VAQ_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ diff --git a/media/libvpx/vp9/encoder/vp9_bitstream.c b/media/libvpx/vp9/encoder/vp9_bitstream.c index efbadbab0816..b60524820de0 100644 --- a/media/libvpx/vp9/encoder/vp9_bitstream.c +++ b/media/libvpx/vp9/encoder/vp9_bitstream.c @@ -14,185 +14,77 @@ #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem_ops.h" +#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_entropymv.h" -#include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_tile_common.h" -#include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_pred_common.h" -#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_mvref_common.h" -#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_seg_common.h" #include 
"vp9/common/vp9_systemdependent.h" -#include "vp9/common/vp9_pragmas.h" +#include "vp9/common/vp9_tile_common.h" -#include "vp9/encoder/vp9_mcomp.h" -#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_bitstream.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_subexp.h" +#include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_write_bit_buffer.h" +static struct vp9_token intra_mode_encodings[INTRA_MODES]; +static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS]; +static struct vp9_token partition_encodings[PARTITION_TYPES]; +static struct vp9_token inter_mode_encodings[INTER_MODES]; -#if defined(SECTIONBITS_OUTPUT) -unsigned __int64 Sectionbits[500]; -#endif - -#ifdef ENTROPY_STATS -int intra_mode_stats[INTRA_MODES] - [INTRA_MODES] - [INTRA_MODES]; -vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES]; - -extern unsigned int active_section; -#endif - - -#ifdef MODE_STATS -int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; -int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; -int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; -int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; - -void init_tx_count_stats() { - vp9_zero(tx_count_32x32p_stats); - vp9_zero(tx_count_16x16p_stats); - vp9_zero(tx_count_8x8p_stats); +void vp9_entropy_mode_init() { + vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree); + vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree); + vp9_tokens_from_tree(partition_encodings, vp9_partition_tree); + vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree); } -void init_switchable_interp_stats() { - vp9_zero(switchable_interp_stats); +static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode, + const vp9_prob *probs) { + vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]); } -static void update_tx_count_stats(VP9_COMMON *cm) { - int i, j; - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZES; j++) { - tx_count_32x32p_stats[i][j] += cm->fc.tx_count_32x32p[i][j]; - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZES - 1; j++) { - tx_count_16x16p_stats[i][j] += cm->fc.tx_count_16x16p[i][j]; - } - } - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZES - 2; j++) { - tx_count_8x8p_stats[i][j] += cm->fc.tx_count_8x8p[i][j]; - } - } +static void write_inter_mode(vp9_writer *w, PREDICTION_MODE mode, + const vp9_prob *probs) { + assert(is_inter_mode(mode)); + vp9_write_token(w, vp9_inter_mode_tree, probs, + &inter_mode_encodings[INTER_OFFSET(mode)]); } -static void update_switchable_interp_stats(VP9_COMMON *cm) { - int i, j; - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) - for (j = 0; j < SWITCHABLE_FILTERS; ++j) - switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j]; -} - -void write_tx_count_stats() { - int i, j; - FILE *fp = fopen("tx_count.bin", "wb"); - fwrite(tx_count_32x32p_stats, sizeof(tx_count_32x32p_stats), 1, fp); - fwrite(tx_count_16x16p_stats, sizeof(tx_count_16x16p_stats), 1, fp); - fwrite(tx_count_8x8p_stats, sizeof(tx_count_8x8p_stats), 1, fp); - fclose(fp); - - printf( - "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZES] = {\n"); - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - printf(" { "); - for (j = 0; j < TX_SIZES; j++) { - printf("%"PRId64", ", tx_count_32x32p_stats[i][j]); - } - 
printf("},\n"); - } - printf("};\n"); - printf( - "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZES-1] = {\n"); - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - printf(" { "); - for (j = 0; j < TX_SIZES - 1; j++) { - printf("%"PRId64", ", tx_count_16x16p_stats[i][j]); - } - printf("},\n"); - } - printf("};\n"); - printf( - "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZES-2] = {\n"); - for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - printf(" { "); - for (j = 0; j < TX_SIZES - 2; j++) { - printf("%"PRId64", ", tx_count_8x8p_stats[i][j]); - } - printf("},\n"); - } - printf("};\n"); -} - -void write_switchable_interp_stats() { - int i, j; - FILE *fp = fopen("switchable_interp.bin", "wb"); - fwrite(switchable_interp_stats, sizeof(switchable_interp_stats), 1, fp); - fclose(fp); - - printf( - "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]" - "[SWITCHABLE_FILTERS] = {\n"); - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { - printf(" { "); - for (j = 0; j < SWITCHABLE_FILTERS; j++) { - printf("%"PRId64", ", switchable_interp_stats[i][j]); - } - printf("},\n"); - } - printf("};\n"); -} -#endif - -static INLINE void write_be32(uint8_t *p, int value) { - p[0] = value >> 24; - p[1] = value >> 16; - p[2] = value >> 8; - p[3] = value; -} - -void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, - int data, int max) { +static void encode_unsigned_max(struct vp9_write_bit_buffer *wb, + int data, int max) { vp9_wb_write_literal(wb, data, get_unsigned_bits(max)); } -static void update_mode(vp9_writer *w, int n, vp9_tree tree, - vp9_prob Pcur[/* n-1 */], - unsigned int bct[/* n-1 */][2], - const unsigned int num_events[/* n */]) { - int i = 0; +static void prob_diff_update(const vp9_tree_index *tree, + vp9_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], + int n, vp9_writer *w) { + int i; + unsigned int branch_ct[32][2]; - vp9_tree_probs_from_distribution(tree, bct, num_events); + // Assuming max number of probabilities <= 32 + assert(n <= 32); + + vp9_tree_probs_from_distribution(tree, branch_ct, counts); for (i = 0; i < n - 1; ++i) - vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]); + vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void update_mbintra_mode_probs(VP9_COMP* const cpi, - vp9_writer* const bc) { - VP9_COMMON *const cm = &cpi->common; - int j; - unsigned int bct[INTRA_MODES - 1][2]; - - for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, - cm->fc.y_mode_prob[j], bct, - (unsigned int *)cpi->y_mode_count[j]); -} - -static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *xd, TX_SIZE tx_size, BLOCK_SIZE bsize, vp9_writer *w) { const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, - &cpi->common.fc.tx_probs); + &cm->fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -201,66 +93,34 @@ static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m, } } -static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m, - vp9_writer *w) { - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MODE_INFO *mi, vp9_writer *w) 
{ + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { - const int skip_coeff = m->mbmi.skip_coeff; - vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd)); - return skip_coeff; + const int skip = mi->mbmi.skip; + vp9_write(w, skip, vp9_get_skip_prob(cm, xd)); + return skip; } } -void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) { - VP9_COMMON *cm = &cpi->common; +static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) { int k; - for (k = 0; k < MBSKIP_CONTEXTS; ++k) - vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]); + for (k = 0; k < SKIP_CONTEXTS; ++k) + vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]); } -static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) { - write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m); +static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) { + int j; + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + prob_diff_update(vp9_switchable_interp_tree, + cm->fc.switchable_interp_prob[j], + cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w); } -static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) { - VP9_COMMON *const cm = &cpi->common; - unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2]; - int i, j; - for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) { - vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct, - cm->counts.switchable_interp[j]); - - for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) - vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i], - branch_ct[i]); - } - -#ifdef MODE_STATS - if (!cpi->dummy_packing) - update_switchable_interp_stats(cm); -#endif -} - -static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) { - int i, j; - - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { - unsigned int branch_ct[INTER_MODES - 1][2]; - vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct, - cm->counts.inter_mode[i]); - - for (j = 0; j < INTER_MODES - 1; ++j) - vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j], - branch_ct[j]); - } -} - -static void pack_mb_tokens(vp9_writer* const w, - TOKENEXTRA **tp, - const TOKENEXTRA *const stop) { +static void pack_mb_tokens(vp9_writer *w, + TOKENEXTRA **tp, const TOKENEXTRA *const stop) { TOKENEXTRA *p = *tp; while (p < stop && p->token != EOSB_TOKEN) { @@ -268,18 +128,8 @@ static void pack_mb_tokens(vp9_writer* const w, const struct vp9_token *const a = &vp9_coef_encodings[t]; const vp9_extra_bit *const b = &vp9_extra_bits[t]; int i = 0; - const vp9_prob *pp; int v = a->value; int n = a->len; - vp9_prob probs[ENTROPY_NODES]; - - if (t >= TWO_TOKEN) { - vp9_model_to_full_probs(p->context_tree, probs); - pp = probs; - } else { - pp = p->context_tree; - } - assert(pp != 0); /* skip one or two nodes */ if (p->skip_eob_node) { @@ -287,11 +137,24 @@ static void pack_mb_tokens(vp9_writer* const w, i = 2 * p->skip_eob_node; } - do { - const int bb = (v >> --n) & 1; - vp9_write(w, bb, pp[i >> 1]); - i = vp9_coef_tree[i + bb]; - } while (n); + // TODO(jbb): expanding this can lead to big gains. It allows + // much better branch prediction and would enable us to avoid numerous + // lookups and compares. + + // If we have a token that's in the constrained set, the coefficient tree + // is split into two treed writes. The first treed write takes care of the + // unconstrained nodes. The second treed write takes care of the + // constrained nodes. 
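+    // Concretely: the top (UNCONSTRAINED_NODES - skip_eob_node) bits of the
+    // token value are written through vp9_coef_tree with the per-context
+    // model probabilities, and the remaining low bits are written through
+    // vp9_coef_con_tree using the vp9_pareto8_full row selected by the
+    // pivot-node probability.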
+ if (t >= TWO_TOKEN && t < EOB_TOKEN) { + int len = UNCONSTRAINED_NODES - p->skip_eob_node; + int bits = v >> (n - len); + vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i); + vp9_write_tree(w, vp9_coef_con_tree, + vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1], + v, n - len, 0); + } else { + vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i); + } if (b->base_val) { const int e = p->extra, l = b->len; @@ -317,275 +180,226 @@ static void pack_mb_tokens(vp9_writer* const w, *tp = p + (p->token == EOSB_TOKEN); } -static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode, - const vp9_prob *p) { - assert(is_inter_mode(mode)); - write_token(w, vp9_inter_mode_tree, p, - &vp9_inter_mode_encodings[INTER_OFFSET(mode)]); -} - - static void write_segment_id(vp9_writer *w, const struct segmentation *seg, int segment_id) { if (seg->enabled && seg->update_map) - treed_write(w, vp9_segment_tree, seg->tree_probs, segment_id, 3); + vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0); } // This function encodes the reference frame -static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi; - const int segment_id = mi->segment_id; - int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, - SEG_LVL_REF_FRAME); +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, + vp9_writer *w) { + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int is_compound = has_second_ref(mbmi); + const int segment_id = mbmi->segment_id; + // If segment level coding of this signal is disabled... // or the segment allows multiple reference frame options - if (!seg_ref_active) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == + vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else { // does the feature use compound prediction or not // (if not specified at the frame/segment level) - if (cm->comp_pred_mode == HYBRID_PREDICTION) { - vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME, - vp9_get_pred_prob_comp_inter_inter(cm, xd)); + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd)); } else { - assert((mi->ref_frame[1] <= INTRA_FRAME) == - (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY)); + assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE)); } - if (mi->ref_frame[1] > INTRA_FRAME) { - vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME, + if (is_compound) { + vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME, vp9_get_pred_prob_comp_ref_p(cm, xd)); } else { - vp9_write(bc, mi->ref_frame[0] != LAST_FRAME, - vp9_get_pred_prob_single_ref_p1(cm, xd)); - if (mi->ref_frame[0] != LAST_FRAME) - vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME, - vp9_get_pred_prob_single_ref_p2(cm, xd)); + const int bit0 = mbmi->ref_frame[0] != LAST_FRAME; + vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd)); + if (bit0) { + const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME; + vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd)); + } } - } else { - assert(mi->ref_frame[1] <= INTRA_FRAME); - assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) == - mi->ref_frame[0]); } - - // If using the prediction model we have nothing further to do because - // the reference frame is fully coded by the segment. 
} -static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { +static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, + vp9_writer *w) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc.nmvc; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - struct segmentation *seg = &cm->seg; - MB_MODE_INFO *const mi = &m->mbmi; - const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; - const MB_PREDICTION_MODE mode = mi->mode; - const int segment_id = mi->segment_id; - int skip_coeff; - const BLOCK_SIZE bsize = mi->sb_type; + const MACROBLOCK *const x = &cpi->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct segmentation *const seg = &cm->seg; + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const PREDICTION_MODE mode = mbmi->mode; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - -#ifdef ENTROPY_STATS - active_section = 9; -#endif + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int skip, ref; if (seg->update_map) { if (seg->temporal_update) { - const int pred_flag = mi->seg_id_predicted; + const int pred_flag = mbmi->seg_id_predicted; vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd); - vp9_write(bc, pred_flag, pred_prob); + vp9_write(w, pred_flag, pred_prob); if (!pred_flag) - write_segment_id(bc, seg, segment_id); + write_segment_id(w, seg, segment_id); } else { - write_segment_id(bc, seg, segment_id); + write_segment_id(w, seg, segment_id); } } - skip_coeff = write_skip_coeff(cpi, segment_id, m, bc); + skip = write_skip(cm, xd, segment_id, mi, w); if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - vp9_write(bc, rf != INTRA_FRAME, - vp9_get_pred_prob_intra_inter(cm, xd)); + vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && - !(rf != INTRA_FRAME && - (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc); + !(is_inter && + (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); } - if (rf == INTRA_FRAME) { -#ifdef ENTROPY_STATS - active_section = 6; -#endif - + if (!is_inter) { if (bsize >= BLOCK_8X8) { - write_intra_mode(bc, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]); + write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]); } else { int idx, idy; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; - write_intra_mode(bc, bm, cm->fc.y_mode_prob[0]); + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode; + write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]); } } } - write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]); + write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]); } else { - vp9_prob *mv_ref_p; - encode_ref_frame(cpi, bc); - mv_ref_p = cpi->common.fc.inter_mode_probs[mi->mode_context[rf]]; - -#ifdef ENTROPY_STATS - active_section = 3; -#endif + 
const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; + const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx]; + write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { - write_sb_mv_ref(bc, mode, mv_ref_p); - ++cm->counts.inter_mode[mi->mode_context[rf]] - [INTER_OFFSET(mode)]; + write_inter_mode(w, mode, inter_probs); + ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)]; } } - if (cm->mcomp_filter_type == SWITCHABLE) { + if (cm->interp_filter == SWITCHABLE) { const int ctx = vp9_get_pred_context_switchable_interp(xd); - write_token(bc, vp9_switchable_interp_tree, - cm->fc.switchable_interp_prob[ctx], - &vp9_switchable_interp_encodings[mi->interp_filter]); + vp9_write_token(w, vp9_switchable_interp_tree, + cm->fc.switchable_interp_prob[ctx], + &switchable_interp_encodings[mbmi->interp_filter]); + ++cpi->interp_filter_selected[0][mbmi->interp_filter]; } else { - assert(mi->interp_filter == cm->mcomp_filter_type); + assert(mbmi->interp_filter == cm->interp_filter); } if (bsize < BLOCK_8X8) { - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; int idx, idy; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { const int j = idy * 2 + idx; - const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode; - write_sb_mv_ref(bc, blockmode, mv_ref_p); - ++cm->counts.inter_mode[mi->mode_context[rf]] - [INTER_OFFSET(blockmode)]; - - if (blockmode == NEWMV) { -#ifdef ENTROPY_STATS - active_section = 11; -#endif - vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, - &mi->best_mv[0].as_mv, nmvc, allow_hp); - - if (has_second_ref(mi)) - vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, - &mi->best_mv[1].as_mv, nmvc, allow_hp); + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; + write_inter_mode(w, b_mode, inter_probs); + ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; + if (b_mode == NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) + vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, + &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, + nmvc, allow_hp); } } } - } else if (mode == NEWMV) { -#ifdef ENTROPY_STATS - active_section = 5; -#endif - vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, - &mi->best_mv[0].as_mv, nmvc, allow_hp); - - if (has_second_ref(mi)) - vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, - &mi->best_mv[1].as_mv, nmvc, allow_hp); + } else { + if (mode == NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) + vp9_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, + &mbmi->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc, + allow_hp); + } } } } -static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, - vp9_writer *bc) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO **mi_8x8, vp9_writer *w) { const struct segmentation *const seg = &cm->seg; - MODE_INFO *m = mi_8x8[0]; - const int ym = m->mbmi.mode; - const int segment_id = m->mbmi.segment_id; - MODE_INFO *above_mi = mi_8x8[-xd->mode_info_stride]; - MODE_INFO *left_mi = xd->left_available ? 
mi_8x8[-1] : NULL; + const MODE_INFO *const mi = mi_8x8[0]; + const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride]; + const MODE_INFO *const left_mi = xd->left_available ? mi_8x8[-1] : NULL; + const MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; if (seg->update_map) - write_segment_id(bc, seg, m->mbmi.segment_id); + write_segment_id(w, seg, mbmi->segment_id); - write_skip_coeff(cpi, segment_id, m, bc); + write_skip(cm, xd, mbmi->segment_id, mi, w); - if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) - write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc); + if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); - if (m->mbmi.sb_type >= BLOCK_8X8) { - const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0); - const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0); - write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); + if (bsize >= BLOCK_8X8) { + write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0)); } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; int idx, idy; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - int i = idy * 2 + idx; - const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, i); - const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, i); - const int bm = m->bmi[i].as_mode; -#ifdef ENTROPY_STATS - ++intra_mode_stats[A][L][bm]; -#endif - write_intra_mode(bc, bm, vp9_kf_y_mode_prob[A][L]); + + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int block = idy * 2 + idx; + write_intra_mode(w, mi->bmi[block].as_mode, + get_y_mode_probs(mi, above_mi, left_mi, block)); } } } - write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]); + write_intra_mode(w, mbmi->uv_mode, vp9_kf_uv_mode_prob[mbmi->mode]); } static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + vp9_writer *w, TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; MODE_INFO *m; - xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col); - m = xd->mi_8x8[0]; + xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); + m = xd->mi[0]; set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, xd->mi_8x8, w); -#ifdef ENTROPY_STATS - active_section = 8; -#endif + write_mb_modes_kf(cm, xd, xd->mi, w); } else { pack_inter_mode_mvs(cpi, m, w); -#ifdef ENTROPY_STATS - active_section = 1; -#endif } assert(*tok < tok_end); pack_mb_tokens(w, tok, tok_end); } -static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, +static void write_partition(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { - VP9_COMMON *const cm = &cpi->common; - const int ctx = partition_plane_context(cpi->above_seg_context, - 
cpi->left_seg_context, - mi_row, mi_col, bsize); + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); const vp9_prob *const probs = get_partition_probs(cm, ctx); const int has_rows = (mi_row + hbs) < cm->mi_rows; const int has_cols = (mi_col + hbs) < cm->mi_cols; if (has_rows && has_cols) { - write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]); + vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]); } else if (!has_rows && has_cols) { assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); vp9_write(w, p == PARTITION_SPLIT, probs[1]); @@ -597,21 +411,26 @@ static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col, } } -static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, +static void write_modes_sb(VP9_COMP *cpi, + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const int bsl = b_width_log2(bsize); const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; BLOCK_SIZE subsize; - MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col]; + const MODE_INFO *m = NULL; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + partition = partition_lookup[bsl][m->mbmi.sb_type]; - write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w); + write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); if (subsize < BLOCK_8X8) { write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); @@ -647,37 +466,36 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, // update partition context if (bsize >= BLOCK_8X8 && (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) - update_partition_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, subsize, bsize); + update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) { +static void write_modes(VP9_COMP *cpi, + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) { int mi_row, mi_col; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { - vp9_zero(cpi->left_seg_context); + vp9_zero(cpi->mb.e_mbd.left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, + BLOCK_64X64); } } -static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { - vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size]; +static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size, + vp9_coeff_stats *coef_branch_ct, + vp9_coeff_probs_model *coef_probs) { vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size]; - unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = + unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] = cpi->common.counts.eob_branch[tx_size]; - vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size]; int i, j, k, l, m; - for (i = 0; i < BLOCK_TYPES; ++i) { + for (i = 0; i < PLANE_TYPES; ++i) { 
for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { vp9_tree_probs_from_distribution(vp9_coef_tree, coef_branch_ct[i][j][k][l], coef_counts[i][j][k][l]); @@ -687,58 +505,38 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { coef_probs[i][j][k][l][m] = get_binary_prob( coef_branch_ct[i][j][k][l][m][0], coef_branch_ct[i][j][k][l][m][1]); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) { - int t; - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - context_counters[tx_size][i][j][k][l][t] += - coef_counts[i][j][k][l][t]; - context_counters[tx_size][i][j][k][l][MAX_ENTROPY_TOKENS] += - eob_branch_ct[i][j][k][l]; - } -#endif } } } } } -static void build_coeff_contexts(VP9_COMP *cpi) { - TX_SIZE t; - for (t = TX_4X4; t <= TX_32X32; t++) - build_tree_distribution(cpi, t); -} - static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, - TX_SIZE tx_size) { - vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size]; - vp9_coeff_probs_model *old_frame_coef_probs = - cpi->common.fc.coef_probs[tx_size]; - vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size]; + TX_SIZE tx_size, + vp9_coeff_stats *frame_branch_ct, + vp9_coeff_probs_model *new_coef_probs) { + vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size]; const vp9_prob upd = DIFF_UPDATE_PROB; const int entropy_nodes_update = UNCONSTRAINED_NODES; int i, j, k, l, t; switch (cpi->sf.use_fast_coef_updates) { - case 0: { - /* dry run to see if there is any udpate at all needed */ + case TWO_LOOP: { + /* dry run to see if there is any update at all needed */ int savings = 0; int update[2] = {0, 0}; - for (i = 0; i < BLOCK_TYPES; ++i) { + for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { for (t = 0; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t]; + vp9_prob newp = new_coef_probs[i][j][k][l][t]; + const vp9_prob oldp = old_coef_probs[i][j][k][l][t]; int s; int u = 0; - - if (l >= 3 && k == 0) - continue; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + old_coef_probs[i][j][k][l], &newp, upd); else s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], oldp, &newp, upd); @@ -762,23 +560,21 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, return; } vp9_write_bit(bc, 1); - for (i = 0; i < BLOCK_TYPES; ++i) { + for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { // calc probs and branch cts for this frame only for (t = 0; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + vp9_prob newp = new_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_coef_probs[i][j][k][l] + t; const vp9_prob upd = DIFF_UPDATE_PROB; int s; int u = 0; - if (l >= 3 && k == 0) - continue; if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], 
&newp, upd, i, j); + old_coef_probs[i][j][k][l], &newp, upd); else s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], @@ -786,10 +582,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (s > 0 && newp != *oldp) u = 1; vp9_write(bc, u, upd); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif if (u) { /* send/use new probability */ vp9_write_prob_diff_update(bc, newp, *oldp); @@ -803,28 +595,26 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, return; } - case 1: - case 2: { + case ONE_LOOP: + case ONE_LOOP_REDUCED: { const int prev_coef_contexts_to_update = - (cpi->sf.use_fast_coef_updates == 2 ? - PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS); + cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ? + COEFF_CONTEXTS >> 1 : COEFF_CONTEXTS; const int coef_band_to_update = - (cpi->sf.use_fast_coef_updates == 2 ? - COEF_BANDS >> 1 : COEF_BANDS); + cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ? + COEF_BANDS >> 1 : COEF_BANDS; int updates = 0; int noupdates_before_first = 0; - for (i = 0; i < BLOCK_TYPES; ++i) { + for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { // calc probs and branch cts for this frame only for (t = 0; t < entropy_nodes_update; ++t) { - vp9_prob newp = new_frame_coef_probs[i][j][k][l][t]; - vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t; + vp9_prob newp = new_coef_probs[i][j][k][l][t]; + vp9_prob *oldp = old_coef_probs[i][j][k][l] + t; int s; int u = 0; - if (l >= 3 && k == 0) - continue; if (l >= prev_coef_contexts_to_update || k >= coef_band_to_update) { u = 0; @@ -832,7 +622,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, if (t == PIVOT_NODE) s = vp9_prob_diff_update_savings_search_model( frame_branch_ct[i][j][k][l][0], - old_frame_coef_probs[i][j][k][l], &newp, upd, i, j); + old_coef_probs[i][j][k][l], &newp, upd); else s = vp9_prob_diff_update_savings_search( frame_branch_ct[i][j][k][l][t], @@ -843,10 +633,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, updates += u; if (u == 0 && updates == 0) { noupdates_before_first++; -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif continue; } if (u == 1 && updates == 1) { @@ -857,10 +643,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, vp9_write(bc, 0, upd); } vp9_write(bc, u, upd); -#ifdef ENTROPY_STATS - if (!cpi->dummy_packing) - ++tree_update_hist[tx_size][i][j][k][l][t][u]; -#endif if (u) { /* send/use new probability */ vp9_write_prob_diff_update(bc, newp, *oldp); @@ -882,25 +664,20 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, } } -static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) { +static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) { const TX_MODE tx_mode = cpi->common.tx_mode; + const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + TX_SIZE tx_size; + vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES]; + vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES]; - vp9_clear_system_state(); + for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size) + build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size], + frame_coef_probs[tx_size]); - // Build the cofficient contexts based on counts collected in 
encode loop - build_coeff_contexts(cpi); - - update_coef_probs_common(bc, cpi, TX_4X4); - - // do not do this if not even allowed - if (tx_mode > ONLY_4X4) - update_coef_probs_common(bc, cpi, TX_8X8); - - if (tx_mode > ALLOW_8X8) - update_coef_probs_common(bc, cpi, TX_16X16); - - if (tx_mode > ALLOW_16X16) - update_coef_probs_common(bc, cpi, TX_32X32); + for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) + update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size], + frame_coef_probs[tx_size]); } static void encode_loopfilter(struct loopfilter *lf, @@ -916,38 +693,27 @@ static void encode_loopfilter(struct loopfilter *lf, vp9_wb_write_bit(wb, lf->mode_ref_delta_enabled); if (lf->mode_ref_delta_enabled) { - // Do the deltas need to be updated vp9_wb_write_bit(wb, lf->mode_ref_delta_update); if (lf->mode_ref_delta_update) { - // Send update for (i = 0; i < MAX_REF_LF_DELTAS; i++) { const int delta = lf->ref_deltas[i]; - - // Frame level data - if (delta != lf->last_ref_deltas[i]) { + const int changed = delta != lf->last_ref_deltas[i]; + vp9_wb_write_bit(wb, changed); + if (changed) { lf->last_ref_deltas[i] = delta; - vp9_wb_write_bit(wb, 1); - - assert(delta != 0); vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6); vp9_wb_write_bit(wb, delta < 0); - } else { - vp9_wb_write_bit(wb, 0); } } - // Send update for (i = 0; i < MAX_MODE_LF_DELTAS; i++) { const int delta = lf->mode_deltas[i]; - if (delta != lf->last_mode_deltas[i]) { + const int changed = delta != lf->last_mode_deltas[i]; + vp9_wb_write_bit(wb, changed); + if (changed) { lf->last_mode_deltas[i] = delta; - vp9_wb_write_bit(wb, 1); - - assert(delta != 0); vp9_wb_write_literal(wb, abs(delta) & 0x3F, 6); vp9_wb_write_bit(wb, delta < 0); - } else { - vp9_wb_write_bit(wb, 0); } } } @@ -964,7 +730,7 @@ static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) { } } -static void encode_quantization(VP9_COMMON *cm, +static void encode_quantization(const VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); write_delta_q(wb, cm->y_dc_delta_q); @@ -972,12 +738,11 @@ static void encode_quantization(VP9_COMMON *cm, write_delta_q(wb, cm->uv_ac_delta_q); } - -static void encode_segmentation(VP9_COMP *cpi, +static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd, struct vp9_write_bit_buffer *wb) { int i, j; - struct segmentation *seg = &cpi->common.seg; + const struct segmentation *seg = &cm->seg; vp9_wb_write_bit(wb, seg->enabled); if (!seg->enabled) @@ -987,7 +752,7 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_wb_write_bit(wb, seg->update_map); if (seg->update_map) { // Select the coding strategy (temporal or spatial) - vp9_choose_segmap_coding_method(cpi); + vp9_choose_segmap_coding_method(cm, xd); // Write out probabilities used to decode unpredicted macro-block segments for (i = 0; i < SEG_TREE_PROBS; i++) { const int prob = seg->tree_probs[i]; @@ -1024,10 +789,10 @@ static void encode_segmentation(VP9_COMP *cpi, const int data_max = vp9_seg_feature_data_max(j); if (vp9_is_segfeature_signed(j)) { - vp9_encode_unsigned_max(wb, abs(data), data_max); + encode_unsigned_max(wb, abs(data), data_max); vp9_wb_write_bit(wb, data < 0); } else { - vp9_encode_unsigned_max(wb, data, data_max); + encode_unsigned_max(wb, data, data_max); } } } @@ -1035,10 +800,7 @@ static void encode_segmentation(VP9_COMP *cpi, } } - -static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { - VP9_COMMON *const cm = &cpi->common; - +static void 
encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) { // Mode vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2); if (cm->tx_mode >= ALLOW_32X32) @@ -1071,26 +833,20 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], ct_32x32p[j]); } -#ifdef MODE_STATS - if (!cpi->dummy_packing) - update_tx_count_stats(cm); -#endif } } -static void write_interp_filter_type(INTERPOLATION_TYPE type, - struct vp9_write_bit_buffer *wb) { - const int type_to_literal[] = { 1, 0, 2, 3 }; +static void write_interp_filter(INTERP_FILTER filter, + struct vp9_write_bit_buffer *wb) { + const int filter_to_literal[] = { 1, 0, 2, 3 }; - vp9_wb_write_bit(wb, type == SWITCHABLE); - if (type != SWITCHABLE) - vp9_wb_write_literal(wb, type_to_literal[type], 2); + vp9_wb_write_bit(wb, filter == SWITCHABLE); + if (filter != SWITCHABLE) + vp9_wb_write_literal(wb, filter_to_literal[filter], 2); } -static void fix_mcomp_filter_type(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - - if (cm->mcomp_filter_type == SWITCHABLE) { +static void fix_interp_filter(VP9_COMMON *cm) { + if (cm->interp_filter == SWITCHABLE) { // Check to see if only one of the filters is actually used int count[SWITCHABLE_FILTERS]; int i, j, c = 0; @@ -1104,7 +860,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { // Only one filter is used. So set the filter at frame level for (i = 0; i < SWITCHABLE_FILTERS; ++i) { if (count[i]) { - cm->mcomp_filter_type = i; + cm->interp_filter = i; break; } } @@ -1112,7 +868,8 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) { } } -static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { +static void write_tile_info(const VP9_COMMON *const cm, + struct vp9_write_bit_buffer *wb) { int min_log2_tile_cols, max_log2_tile_cols, ones; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); @@ -1131,39 +888,29 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { } static int get_refresh_mask(VP9_COMP *cpi) { - // Should the GF or ARF be updated using the transmitted frame or buffer -#if CONFIG_MULTIPLE_ARF - if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && - !cpi->refresh_alt_ref_frame) { -#else - if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && - !cpi->use_svc) { -#endif - // Preserve the previously existing golden frame and update the frame in - // the alt ref slot instead. This is highly specific to the use of - // alt-ref as a forward reference, and this needs to be generalized as - // other uses are implemented (like RTC/temporal scaling) - // - // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but - // that happens in vp9_onyx_if.c:update_reference_frames() so that it can - // be done outside of the recode loop. - return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->alt_fb_idx); - } else { - int arf_idx = cpi->alt_fb_idx; -#if CONFIG_MULTIPLE_ARF - // Determine which ARF buffer to use to encode this ARF frame. - if (cpi->multi_arf_enabled) { - int sn = cpi->sequence_number; - arf_idx = (cpi->frame_coding_order[sn] < 0) ? - cpi->arf_buffer_idx[sn + 1] : - cpi->arf_buffer_idx[sn]; - } -#endif - return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->gld_fb_idx) | - (cpi->refresh_alt_ref_frame << arf_idx); + if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. 
However, in the short term we leave it in the GF slot and, + // if we're updating the GF with the current decoded frame, we save it + // instead to the ARF slot. + // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we + // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it + // there so that it can be done outside of the recode loop. + // Note: This is highly specific to the use of ARF as a forward reference, + // and this needs to be generalized as other uses are implemented + // (like RTC/temporal scalability). + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { + int arf_idx = cpi->alt_fb_idx; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; } + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << arf_idx); + } } static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { @@ -1176,7 +923,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; - vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) * + vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); tok[0][0] = cpi->tok; @@ -1207,7 +954,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { vp9_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { // size of this tile - write_be32(data_ptr + total_size, residual_bc.pos); + mem_put_be32(data_ptr + total_size, residual_bc.pos); total_size += 4; } @@ -1218,9 +965,8 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { return total_size; } -static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { - VP9_COMMON *const cm = &cpi->common; - +static void write_display_size(const VP9_COMMON *cm, + struct vp9_write_bit_buffer *wb) { const int scaling_active = cm->width != cm->display_width || cm->height != cm->display_height; vp9_wb_write_bit(wb, scaling_active); @@ -1230,30 +976,31 @@ static void write_display_size(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { } } -static void write_frame_size(VP9_COMP *cpi, +static void write_frame_size(const VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { - VP9_COMMON *const cm = &cpi->common; vp9_wb_write_literal(wb, cm->width - 1, 16); vp9_wb_write_literal(wb, cm->height - 1, 16); - write_display_size(cpi, wb); + write_display_size(cm, wb); } static void write_frame_size_with_refs(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; - int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx, - cpi->alt_fb_idx}; - int i, found = 0; + int found = 0; - for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]]; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); found = cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height; - // TODO(ivan): This prevents a bug while more than 3 buffers are used. Do it - // in a better way. 
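/* Aside: a minimal compilable sketch of the bitmask get_refresh_mask() above
 * builds. Each refresh flag is shifted to the index of the frame-buffer slot
 * it refreshes; in the preserve-GF path the golden refresh is redirected to
 * the ALTREF slot, which is the swap the comment above describes. The slot
 * indices below are made-up example values, not the encoder's real state. */
#include <stdio.h>

static int refresh_mask(int refresh_last, int lst_fb_idx,
                        int refresh_golden, int gld_fb_idx,
                        int refresh_alt, int alt_fb_idx) {
  return (refresh_last << lst_fb_idx) |
         (refresh_golden << gld_fb_idx) |
         (refresh_alt << alt_fb_idx);
}

int main(void) {
  /* e.g. LAST in slot 0, GOLDEN in slot 1, ALTREF in slot 2; refreshing
   * LAST and ALTREF only sets bits 0 and 2 -> 0x05 */
  printf("refresh mask: 0x%02x\n", refresh_mask(1, 0, 0, 1, 1, 2));
  return 0;
}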
- if (cpi->use_svc) { + // Set "found" to 0 for temporal svc and for spatial svc key frame + if (cpi->use_svc && + ((cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.rc_mode == VPX_CBR) || + (cpi->svc.number_spatial_layers > 1 && + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) { found = 0; } vp9_wb_write_bit(wb, found); @@ -1267,7 +1014,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, vp9_wb_write_literal(wb, cm->height - 1, 16); } - write_display_size(cpi, wb); + write_display_size(cm, wb); } static void write_sync_code(struct vp9_write_bit_buffer *wb) { @@ -1276,44 +1023,77 @@ static void write_sync_code(struct vp9_write_bit_buffer *wb) { vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8); } +static void write_profile(BITSTREAM_PROFILE profile, + struct vp9_write_bit_buffer *wb) { + switch (profile) { + case PROFILE_0: + vp9_wb_write_literal(wb, 0, 2); + break; + case PROFILE_1: + vp9_wb_write_literal(wb, 2, 2); + break; + case PROFILE_2: + vp9_wb_write_literal(wb, 1, 2); + break; + case PROFILE_3: + vp9_wb_write_literal(wb, 6, 3); + break; + default: + assert(0); + } +} + +static void write_bitdepth_colorspace_sampling( + VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { + if (cm->profile >= PROFILE_2) { + assert(cm->bit_depth > VPX_BITS_8); + vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1); + } + vp9_wb_write_literal(wb, cm->color_space, 3); + if (cm->color_space != SRGB) { + vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { + assert(cm->subsampling_x != 1 || cm->subsampling_y != 1); + vp9_wb_write_bit(wb, cm->subsampling_x); + vp9_wb_write_bit(wb, cm->subsampling_y); + vp9_wb_write_bit(wb, 0); // unused + } else { + assert(cm->subsampling_x == 1 && cm->subsampling_y == 1); + } + } else { + assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3); + vp9_wb_write_bit(wb, 0); // unused + } +} + static void write_uncompressed_header(VP9_COMP *cpi, struct vp9_write_bit_buffer *wb) { VP9_COMMON *const cm = &cpi->common; vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2); - // bitstream version. - // 00 - profile 0. 4:2:0 only - // 10 - profile 1. adds 4:4:4, 4:2:2, alpha - vp9_wb_write_bit(wb, cm->version); - vp9_wb_write_bit(wb, 0); + write_profile(cm->profile, wb); - vp9_wb_write_bit(wb, 0); + vp9_wb_write_bit(wb, 0); // show_existing_frame vp9_wb_write_bit(wb, cm->frame_type); vp9_wb_write_bit(wb, cm->show_frame); vp9_wb_write_bit(wb, cm->error_resilient_mode); if (cm->frame_type == KEY_FRAME) { - const COLOR_SPACE cs = UNKNOWN; write_sync_code(wb); - vp9_wb_write_literal(wb, cs, 3); - if (cs != SRGB) { - vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] - if (cm->version == 1) { - vp9_wb_write_bit(wb, cm->subsampling_x); - vp9_wb_write_bit(wb, cm->subsampling_y); - vp9_wb_write_bit(wb, 0); // has extra plane - } - } else { - assert(cm->version == 1); - vp9_wb_write_bit(wb, 0); // has extra plane - } - - write_frame_size(cpi, wb); + write_bitdepth_colorspace_sampling(cm, wb); + write_frame_size(cm, wb); } else { - const int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx, - cpi->alt_fb_idx}; - if (!cm->show_frame) + // In spatial svc if it's not error_resilient_mode then we need to code all + // visible frames as invisible. But we need to keep the show_frame flag so + // that the publisher could know whether it is supposed to be visible. + // So we will code the show_frame flag as it is. Then code the intra_only + // bit here. 
This will make the bitstream incompatible. In the player we + // will change to show_frame flag to 0, then add an one byte frame with + // show_existing_frame flag which tells the decoder which frame we want to + // show. + if (!cm->show_frame || + (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0)) vp9_wb_write_bit(wb, cm->intra_only); if (!cm->error_resilient_mode) @@ -1322,22 +1102,28 @@ static void write_uncompressed_header(VP9_COMP *cpi, if (cm->intra_only) { write_sync_code(wb); - vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES); - write_frame_size(cpi, wb); + // Note for profile 0, 420 8bpp is assumed. + if (cm->profile > PROFILE_0) { + write_bitdepth_colorspace_sampling(cm, wb); + } + + vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); + write_frame_size(cm, wb); } else { - int i; - vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES); - for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) { - vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LOG2); - vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]); + MV_REFERENCE_FRAME ref_frame; + vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + vp9_wb_write_literal(wb, get_ref_frame_idx(cpi, ref_frame), + REF_FRAMES_LOG2); + vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]); } write_frame_size_with_refs(cpi, wb); vp9_wb_write_bit(wb, cm->allow_high_precision_mv); - fix_mcomp_filter_type(cpi); - write_interp_filter_type(cm->mcomp_filter_type, wb); + fix_interp_filter(cm); + write_interp_filter(cm->interp_filter, wb); } } @@ -1346,11 +1132,11 @@ static void write_uncompressed_header(VP9_COMP *cpi, vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode); } - vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2); + vp9_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2); encode_loopfilter(&cm->lf, wb); encode_quantization(cm, wb); - encode_segmentation(cpi, wb); + encode_segmentation(cm, &cpi->mb.e_mbd, wb); write_tile_info(cm, wb); } @@ -1366,36 +1152,30 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (xd->lossless) cm->tx_mode = ONLY_4X4; else - encode_txfm_probs(cpi, &header_bc); + encode_txfm_probs(cm, &header_bc); update_coef_probs(cpi, &header_bc); - -#ifdef ENTROPY_STATS - active_section = 2; -#endif - - vp9_update_skip_probs(cpi, &header_bc); + update_skip_probs(cm, &header_bc); if (!frame_is_intra_only(cm)) { int i; -#ifdef ENTROPY_STATS - active_section = 1; -#endif - update_inter_mode_probs(cm, &header_bc); + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i], + cm->counts.inter_mode[i], INTER_MODES, &header_bc); + vp9_zero(cm->counts.inter_mode); - if (cm->mcomp_filter_type == SWITCHABLE) - update_switchable_interp_probs(cpi, &header_bc); + if (cm->interp_filter == SWITCHABLE) + update_switchable_interp_probs(cm, &header_bc); for (i = 0; i < INTRA_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i], - cpi->intra_inter_count[i]); + cm->counts.intra_inter[i]); if (cm->allow_comp_inter_inter) { - const int comp_pred_mode = cpi->common.comp_pred_mode; - const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY; - const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION; + const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE; + const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT; 
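/* Aside: a small self-contained sketch of how the two prediction-mode bits
 * written just below relate to cm->reference_mode. The enum is a local
 * stand-in for the real one; the bit layout mirrors the checks in this
 * function (the hybrid bit is only coded when the compound bit is set). */
#include <stdio.h>

typedef enum {
  SINGLE_REFERENCE,
  COMPOUND_REFERENCE,
  REFERENCE_MODE_SELECT
} REFERENCE_MODE;

int main(void) {
  REFERENCE_MODE m;
  for (m = SINGLE_REFERENCE; m <= REFERENCE_MODE_SELECT; ++m) {
    const int use_compound_pred = m != SINGLE_REFERENCE;
    const int use_hybrid_pred = m == REFERENCE_MODE_SELECT;
    printf("mode %d: compound=%d", (int)m, use_compound_pred);
    if (use_compound_pred)
      printf(" hybrid=%d", use_hybrid_pred);
    printf("\n");
  }
  return 0;
}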
vp9_write_bit(&header_bc, use_compound_pred); if (use_compound_pred) { @@ -1403,34 +1183,33 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { if (use_hybrid_pred) for (i = 0; i < COMP_INTER_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i], - cpi->comp_inter_count[i]); + cm->counts.comp_inter[i]); } } - if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { + if (cm->reference_mode != COMPOUND_REFERENCE) { for (i = 0; i < REF_CONTEXTS; i++) { vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0], - cpi->single_ref_count[i][0]); + cm->counts.single_ref[i][0]); vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1], - cpi->single_ref_count[i][1]); + cm->counts.single_ref[i][1]); } } - if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) + if (cm->reference_mode != SINGLE_REFERENCE) for (i = 0; i < REF_CONTEXTS; i++) vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i], - cpi->comp_ref_count[i]); + cm->counts.comp_ref[i]); - update_mbintra_mode_probs(cpi, &header_bc); + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) + prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i], + cm->counts.y_mode[i], INTRA_MODES, &header_bc); - for (i = 0; i < PARTITION_CONTEXTS; ++i) { - unsigned int bct[PARTITION_TYPES - 1][2]; - update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree, - fc->partition_prob[i], bct, - (unsigned int *)cpi->partition_count[i]); - } + for (i = 0; i < PARTITION_CONTEXTS; ++i) + prob_diff_update(vp9_partition_tree, fc->partition_prob[i], + cm->counts.partition[i], PARTITION_TYPES, &header_bc); - vp9_write_nmv_probs(cpi, cm->allow_high_precision_mv, &header_bc); + vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc); } vp9_stop_encode(&header_bc); @@ -1439,9 +1218,9 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { return header_bc.pos; } -void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { uint8_t *data = dest; - size_t first_part_size; + size_t first_part_size, uncompressed_hdr_size; struct vp9_write_bit_buffer wb = {data, 0}; struct vp9_write_bit_buffer saved_wb; @@ -1449,75 +1228,17 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) { saved_wb = wb; vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size - data += vp9_rb_bytes_written(&wb); + uncompressed_hdr_size = vp9_wb_bytes_written(&wb); + data += uncompressed_hdr_size; - vp9_compute_update_table(); - -#ifdef ENTROPY_STATS - if (cm->frame_type == INTER_FRAME) - active_section = 0; - else - active_section = 7; -#endif - - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); first_part_size = write_compressed_header(cpi, data); data += first_part_size; - vp9_wb_write_literal(&saved_wb, first_part_size, 16); + // TODO(jbb): Figure out what to do if first_part_size > 16 bits. 
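/* Aside: the placeholder-and-backfill pattern vp9_pack_bitstream() uses for
 * the 16-bit first-partition size, sketched with a simplified byte-aligned
 * writer. The real vp9_write_bit_buffer is bit-granular; this stand-in only
 * illustrates saving the writer state before the size field is emitted. */
#include <stdint.h>
#include <stdio.h>

struct wb { uint8_t *buf; size_t pos; };

static void wb_write_u16(struct wb *w, unsigned v) {
  w->buf[w->pos++] = (uint8_t)(v >> 8);  /* most-significant bits first */
  w->buf[w->pos++] = (uint8_t)(v & 0xff);
}

int main(void) {
  uint8_t frame[64] = {0};
  struct wb w = {frame, 0};
  struct wb saved = w;        /* remember where the size field lives */

  wb_write_u16(&w, 0);        /* size unknown yet: write a placeholder */
  /* ...the compressed header would be written here; say it took 300 bytes */
  wb_write_u16(&saved, 300);  /* backfill once the size is known */

  printf("size field: %02x %02x\n", frame[0], frame[1]); /* 01 2c */
  return 0;
}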
+ vp9_wb_write_literal(&saved_wb, (int)first_part_size, 16); data += encode_tiles(cpi, data); *size = data - dest; } - -#ifdef ENTROPY_STATS -static void print_tree_update_for_type(FILE *f, - vp9_coeff_stats *tree_update_hist, - int block_types, const char *header) { - int i, j, k, l, m; - - fprintf(f, "const vp9_coeff_prob %s = {\n", header); - for (i = 0; i < block_types; i++) { - fprintf(f, " { \n"); - for (j = 0; j < REF_TYPES; j++) { - fprintf(f, " { \n"); - for (k = 0; k < COEF_BANDS; k++) { - fprintf(f, " {\n"); - for (l = 0; l < PREV_COEF_CONTEXTS; l++) { - fprintf(f, " {"); - for (m = 0; m < ENTROPY_NODES; m++) { - fprintf(f, "%3d, ", - get_binary_prob(tree_update_hist[i][j][k][l][m][0], - tree_update_hist[i][j][k][l][m][1])); - } - fprintf(f, "},\n"); - } - fprintf(f, "},\n"); - } - fprintf(f, " },\n"); - } - fprintf(f, " },\n"); - } - fprintf(f, "};\n"); -} - -void print_tree_update_probs() { - FILE *f = fopen("coefupdprob.h", "w"); - fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n"); - - print_tree_update_for_type(f, tree_update_hist[TX_4X4], BLOCK_TYPES, - "vp9_coef_update_probs_4x4[BLOCK_TYPES]"); - print_tree_update_for_type(f, tree_update_hist[TX_8X8], BLOCK_TYPES, - "vp9_coef_update_probs_8x8[BLOCK_TYPES]"); - print_tree_update_for_type(f, tree_update_hist[TX_16X16], BLOCK_TYPES, - "vp9_coef_update_probs_16x16[BLOCK_TYPES]"); - print_tree_update_for_type(f, tree_update_hist[TX_32X32], BLOCK_TYPES, - "vp9_coef_update_probs_32x32[BLOCK_TYPES]"); - - fclose(f); - f = fopen("treeupdate.bin", "wb"); - fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f); - fclose(f); -} -#endif diff --git a/media/libvpx/vp9/encoder/vp9_bitstream.h b/media/libvpx/vp9/encoder/vp9_bitstream.h index b3dbee1a7725..b48826140f9a 100644 --- a/media/libvpx/vp9/encoder/vp9_bitstream.h +++ b/media/libvpx/vp9/encoder/vp9_bitstream.h @@ -12,6 +12,28 @@ #ifndef VP9_ENCODER_VP9_BITSTREAM_H_ #define VP9_ENCODER_VP9_BITSTREAM_H_ -void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *bc); +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/encoder/vp9_encoder.h" + +void vp9_entropy_mode_init(); + +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); + +static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { + return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && + cpi->rc.is_src_frame_alt_ref && + (!cpi->use_svc || // Add spatial svc base layer case here + (is_two_pass_svc(cpi) && + cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].gold_ref_idx >=0 && + cpi->oxcf.ss_play_alternate[0])); +} + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_BITSTREAM_H_ diff --git a/media/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/vp9/encoder/vp9_block.h index 444597067337..b7263832279d 100644 --- a/media/libvpx/vp9/encoder/vp9_block.h +++ b/media/libvpx/vp9/encoder/vp9_block.h @@ -11,81 +11,45 @@ #ifndef VP9_ENCODER_VP9_BLOCK_H_ #define VP9_ENCODER_VP9_BLOCK_H_ -#include "vp9/common/vp9_onyx.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_onyxc_int.h" -// motion search site +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { - MV mv; - int offset; -} search_site; - -// Structure to hold snapshot of coding context during the mode picking process -typedef struct { - MODE_INFO mic; - uint8_t *zcoeff_blk; - int16_t *coeff[MAX_MB_PLANE][3]; - int16_t *qcoeff[MAX_MB_PLANE][3]; - int16_t *dqcoeff[MAX_MB_PLANE][3]; - uint16_t *eobs[MAX_MB_PLANE][3]; - 
- // dual buffer pointers, 0: in use, 1: best in store - int16_t *coeff_pbuf[MAX_MB_PLANE][3]; - int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; - int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; - uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; - - int is_coded; - int num_4x4_blk; - int skip; - int_mv best_ref_mv; - int_mv second_best_ref_mv; - int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int rate; - int distortion; - int64_t intra_error; - int best_mode_index; - int rddiv; - int rdmult; - int hybrid_pred_diff; - int comp_pred_diff; - int single_pred_diff; - int64_t tx_rd_diff[TX_MODES]; - int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - - // motion vector cache for adaptive motion search control in partition - // search loop - int_mv pred_mv[MAX_REF_FRAMES]; - - // Bit flag for each mode whether it has high error in comparison to others. - unsigned int modes_with_high_error; - - // Bit flag for each ref frame whether it has high error compared to others. - unsigned int frames_with_high_error; -} PICK_MODE_CONTEXT; + unsigned int sse; + int sum; + unsigned int var; +} diff; struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); + int16_t *qcoeff; int16_t *coeff; + uint16_t *eobs; struct buf_2d src; // Quantizer setings + int16_t *quant_fp; + int16_t *round_fp; int16_t *quant; int16_t *quant_shift; int16_t *zbin; int16_t *round; + int64_t quant_thred[2]; // Zbin Over Quant value int16_t zbin_extra; }; /* The [2] dimension is for whether we skip the EOB node (i.e. if previous * coefficient in this block was zero) or not. */ -typedef unsigned int vp9_coeff_cost[BLOCK_TYPES][REF_TYPES][COEF_BANDS][2] - [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2] + [COEFF_CONTEXTS][ENTROPY_TOKENS]; typedef struct macroblock MACROBLOCK; struct macroblock { @@ -93,54 +57,34 @@ struct macroblock { MACROBLOCKD e_mbd; int skip_block; - int select_txfm_size; + int select_tx_size; int skip_recode; int skip_optimize; int q_index; - search_site *ss; - int ss_count; - int searches_per_step; - int errorperbit; int sadperbit16; int sadperbit4; int rddiv; int rdmult; - unsigned int mb_energy; - unsigned int *mb_activity_ptr; - int *mb_norm_activity_ptr; - signed int act_zbin_adj; + int mb_energy; int mv_best_ref_index[MAX_REF_FRAMES]; unsigned int max_mv_context[MAX_REF_FRAMES]; unsigned int source_variance; + unsigned int pred_sse[MAX_REF_FRAMES]; + int pred_mv_sad[MAX_REF_FRAMES]; int nmvjointcost[MV_JOINTS]; - int nmvcosts[2][MV_VALS]; int *nmvcost[2]; - int nmvcosts_hp[2][MV_VALS]; int *nmvcost_hp[2]; int **mvcost; int nmvjointsadcost[MV_JOINTS]; - int nmvsadcosts[2][MV_VALS]; int *nmvsadcost[2]; - int nmvsadcosts_hp[2][MV_VALS]; int *nmvsadcost_hp[2]; int **mvsadcost; - int mbmode_cost[MB_MODE_COUNT]; - unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; - int intra_uv_mode_cost[2][MB_MODE_COUNT]; - int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; - int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; - - unsigned char sb_index; // index of 32x32 block inside the 64x64 block - unsigned char mb_index; // index of 16x16 block inside the 32x32 block - unsigned char b_index; // index of 8x8 block inside the 16x16 block - unsigned char ab_index; // index of 4x4 block inside the 8x8 block - // These define limits to motion vector components to prevent them // from extending outside the UMV borders int mv_col_min; @@ -153,11 +97,10 @@ struct macroblock { int encode_breakout; - unsigned char 
*active_ptr; - // note that token_costs is the cost when eob node is skipped vp9_coeff_cost token_costs[TX_SIZES]; - DECLARE_ALIGNED(16, uint8_t, token_cache[1024]); + + int in_static_area; int optimize; @@ -165,91 +108,23 @@ struct macroblock { int use_lp32x32fdct; int skip_encode; + // use fast quantization process + int quant_fp; + + // skip forward transform and quantization + uint8_t skip_txfm[MAX_MB_PLANE << 2]; + + int64_t bsse[MAX_MB_PLANE << 2]; + // Used to store sub partition's choices. - int fast_ms; - int_mv pred_mv[MAX_REF_FRAMES]; - int subblock_ref; - - // TODO(jingning): Need to refactor the structure arrays that buffers the - // coding mode decisions of each partition type. - PICK_MODE_CONTEXT ab4x4_context[4][4][4]; - PICK_MODE_CONTEXT sb8x4_context[4][4][4]; - PICK_MODE_CONTEXT sb4x8_context[4][4][4]; - PICK_MODE_CONTEXT sb8x8_context[4][4][4]; - PICK_MODE_CONTEXT sb8x16_context[4][4][2]; - PICK_MODE_CONTEXT sb16x8_context[4][4][2]; - PICK_MODE_CONTEXT mb_context[4][4]; - PICK_MODE_CONTEXT sb32x16_context[4][2]; - PICK_MODE_CONTEXT sb16x32_context[4][2]; - // when 4 MBs share coding parameters: - PICK_MODE_CONTEXT sb32_context[4]; - PICK_MODE_CONTEXT sb32x64_context[2]; - PICK_MODE_CONTEXT sb64x32_context[2]; - PICK_MODE_CONTEXT sb64_context; - int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; - - BLOCK_SIZE b_partitioning[4][4][4]; - BLOCK_SIZE mb_partitioning[4][4]; - BLOCK_SIZE sb_partitioning[4]; - BLOCK_SIZE sb64_partitioning; + MV pred_mv[MAX_REF_FRAMES]; void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride); + void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); }; -// TODO(jingning): the variables used here are little complicated. need further -// refactoring on organizing the temporary buffers, when recursive -// partition down to 4x4 block size is enabled. 
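/* Aside: the per-size context arrays and the get_block_context() switch
 * removed here are superseded by the PC_TREE added later in this patch
 * (vp9_context_tree.c). A small sketch of that tree's bookkeeping: one
 * square-partition node per 8x8/16x16/32x32/64x64 block of a 64x64
 * superblock, plus 64 shared 8x8 leaf contexts. */
#include <stdio.h>

int main(void) {
  int nodes = 0, blocks;
  for (blocks = 64; blocks > 0; blocks >>= 2)  /* 64 + 16 + 4 + 1 */
    nodes += blocks;
  printf("leaf contexts: 64, square tree nodes: %d\n", nodes);  /* 85 */
  return 0;
}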
-static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) { - switch (bsize) { - case BLOCK_64X64: - return &x->sb64_context; - case BLOCK_64X32: - return &x->sb64x32_context[x->sb_index]; - case BLOCK_32X64: - return &x->sb32x64_context[x->sb_index]; - case BLOCK_32X32: - return &x->sb32_context[x->sb_index]; - case BLOCK_32X16: - return &x->sb32x16_context[x->sb_index][x->mb_index]; - case BLOCK_16X32: - return &x->sb16x32_context[x->sb_index][x->mb_index]; - case BLOCK_16X16: - return &x->mb_context[x->sb_index][x->mb_index]; - case BLOCK_16X8: - return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_8X16: - return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_8X8: - return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_8X4: - return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_4X8: - return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index]; - case BLOCK_4X4: - return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index]; - default: - assert(0); - return NULL; - } -} - -struct rdcost_block_args { - MACROBLOCK *x; - ENTROPY_CONTEXT t_above[16]; - ENTROPY_CONTEXT t_left[16]; - TX_SIZE tx_size; - int bw; - int bh; - int rate; - int64_t dist; - int64_t sse; - int this_rate; - int64_t this_dist; - int64_t this_sse; - int64_t this_rd; - int64_t best_rd; - int skip; - const int16_t *scan, *nb; -}; +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_BLOCK_H_ diff --git a/media/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/vp9/encoder/vp9_context_tree.c new file mode 100644 index 000000000000..9b7a93267f15 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_context_tree.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encoder.h" + +static const BLOCK_SIZE square[] = { + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, +}; + +static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, + PICK_MODE_CONTEXT *ctx) { + const int num_blk = (num_4x4_blk < 4 ? 
4 : num_4x4_blk); + const int num_pix = num_blk << 4; + int i, k; + ctx->num_4x4_blk = num_blk; + + CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, + vpx_calloc(num_4x4_blk, sizeof(uint8_t))); + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + vpx_memalign(16, num_pix * sizeof(int16_t))); + CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + vpx_memalign(16, num_pix * sizeof(uint16_t))); + ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; + ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; + ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; + ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; + } + } +} + +static void free_mode_context(PICK_MODE_CONTEXT *ctx) { + int i, k; + vpx_free(ctx->zcoeff_blk); + ctx->zcoeff_blk = 0; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (k = 0; k < 3; ++k) { + vpx_free(ctx->coeff[i][k]); + ctx->coeff[i][k] = 0; + vpx_free(ctx->qcoeff[i][k]); + ctx->qcoeff[i][k] = 0; + vpx_free(ctx->dqcoeff[i][k]); + ctx->dqcoeff[i][k] = 0; + vpx_free(ctx->eobs[i][k]); + ctx->eobs[i][k] = 0; + } + } +} + +static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree, + int num_4x4_blk) { + alloc_mode_context(cm, num_4x4_blk, &tree->none); + alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[0]); + alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[0]); + + /* TODO(Jbb): for 4x8 and 8x4 these allocated values are not used. + * Figure out a better way to do this. */ + alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[1]); + alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[1]); +} + +static void free_tree_contexts(PC_TREE *tree) { + free_mode_context(&tree->none); + free_mode_context(&tree->horizontal[0]); + free_mode_context(&tree->horizontal[1]); + free_mode_context(&tree->vertical[0]); + free_mode_context(&tree->vertical[1]); +} + +// This function sets up a tree of contexts such that at each square +// partition level. There are contexts for none, horizontal, vertical, and +// split. Along with a block_size value and a selected block_size which +// represents the state of our search. +void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) { + int i, j; + const int leaf_nodes = 64; + const int tree_nodes = 64 + 16 + 4 + 1; + int pc_tree_index = 0; + PC_TREE *this_pc; + PICK_MODE_CONTEXT *this_leaf; + int square_index = 1; + int nodes; + + vpx_free(cpi->leaf_tree); + CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes, + sizeof(*cpi->leaf_tree))); + vpx_free(cpi->pc_tree); + CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes, + sizeof(*cpi->pc_tree))); + + this_pc = &cpi->pc_tree[0]; + this_leaf = &cpi->leaf_tree[0]; + + // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same + // context so we only need to allocate 1 for each 8x8 block. + for (i = 0; i < leaf_nodes; ++i) + alloc_mode_context(cm, 1, &cpi->leaf_tree[i]); + + // Sets up all the leaf nodes in the tree. + for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) { + PC_TREE *const tree = &cpi->pc_tree[pc_tree_index]; + tree->block_size = square[0]; + alloc_tree_contexts(cm, tree, 4); + tree->leaf_split[0] = this_leaf++; + for (j = 1; j < 4; j++) + tree->leaf_split[j] = tree->leaf_split[0]; + } + + // Each node has 4 leaf nodes, fill each block_size level of the tree + // from leafs to the root. 
+ for (nodes = 16; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i) { + PC_TREE *const tree = &cpi->pc_tree[pc_tree_index]; + alloc_tree_contexts(cm, tree, 4 << (2 * square_index)); + tree->block_size = square[square_index]; + for (j = 0; j < 4; j++) + tree->split[j] = this_pc++; + ++pc_tree_index; + } + ++square_index; + } + cpi->pc_root = &cpi->pc_tree[tree_nodes - 1]; + cpi->pc_root[0].none.best_mode_index = 2; +} + +void vp9_free_pc_tree(VP9_COMP *cpi) { + const int tree_nodes = 64 + 16 + 4 + 1; + int i; + + // Set up all 4x4 mode contexts + for (i = 0; i < 64; ++i) + free_mode_context(&cpi->leaf_tree[i]); + + // Sets up all the leaf nodes in the tree. + for (i = 0; i < tree_nodes; ++i) + free_tree_contexts(&cpi->pc_tree[i]); + + vpx_free(cpi->pc_tree); + cpi->pc_tree = NULL; + vpx_free(cpi->leaf_tree); + cpi->leaf_tree = NULL; +} diff --git a/media/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/vp9/encoder/vp9_context_tree.h new file mode 100644 index 000000000000..236389b6f9d8 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_context_tree.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#define VP9_ENCODER_VP9_CONTEXT_TREE_H_ + +#include "vp9/common/vp9_onyxc_int.h" + +struct VP9_COMP; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct { + MODE_INFO mic; + uint8_t *zcoeff_blk; + int16_t *coeff[MAX_MB_PLANE][3]; + int16_t *qcoeff[MAX_MB_PLANE][3]; + int16_t *dqcoeff[MAX_MB_PLANE][3]; + uint16_t *eobs[MAX_MB_PLANE][3]; + + // dual buffer pointers, 0: in use, 1: best in store + int16_t *coeff_pbuf[MAX_MB_PLANE][3]; + int16_t *qcoeff_pbuf[MAX_MB_PLANE][3]; + int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3]; + uint16_t *eobs_pbuf[MAX_MB_PLANE][3]; + + int is_coded; + int num_4x4_blk; + int skip; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 0. 
+ int skippable; + uint8_t skip_txfm[MAX_MB_PLANE << 2]; + int best_mode_index; + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; + int64_t tx_rd_diff[TX_MODES]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + +#if CONFIG_VP9_TEMPORAL_DENOISING + unsigned int newmv_sse; + unsigned int zeromv_sse; + PREDICTION_MODE best_sse_inter_mode; + int_mv best_sse_mv; + MV_REFERENCE_FRAME best_reference_frame; + MV_REFERENCE_FRAME best_zeromv_reference_frame; +#endif + + // motion vector cache for adaptive motion search control in partition + // search loop + MV pred_mv[MAX_REF_FRAMES]; + INTERP_FILTER pred_interp_filter; +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + int index; + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT none; + PICK_MODE_CONTEXT horizontal[2]; + PICK_MODE_CONTEXT vertical[2]; + union { + struct PC_TREE *split[4]; + PICK_MODE_CONTEXT *leaf_split[4]; + }; +} PC_TREE; + +void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi); +void vp9_free_pc_tree(struct VP9_COMP *cpi); + +#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */ diff --git a/media/libvpx/vp9/encoder/vp9_boolhuff.c b/media/libvpx/vp9/encoder/vp9_cost.c similarity index 71% rename from media/libvpx/vp9/encoder/vp9_boolhuff.c rename to media/libvpx/vp9/encoder/vp9_cost.c index 32c136e0f70e..1c3c3d248475 100644 --- a/media/libvpx/vp9/encoder/vp9_boolhuff.c +++ b/media/libvpx/vp9/encoder/vp9_cost.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,18 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include -#include "vp9/encoder/vp9_boolhuff.h" -#include "vp9/common/vp9_entropy.h" - -#if defined(SECTIONBITS_OUTPUT) -unsigned __int64 Sectionbits[500]; - -#endif - -#ifdef ENTROPY_STATS -unsigned int active_section = 0; -#endif +#include "vp9/encoder/vp9_cost.h" const unsigned int vp9_prob_cost[256] = { 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, @@ -45,24 +34,29 @@ const unsigned int vp9_prob_cost[256] = { 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1}; -void vp9_start_encode(vp9_writer *br, uint8_t *source) { - br->lowvalue = 0; - br->range = 255; - br->value = 0; - br->count = -24; - br->buffer = source; - br->pos = 0; - vp9_write_bit(br, 0); +static void cost(int *costs, vp9_tree tree, const vp9_prob *probs, + int i, int c) { + const vp9_prob prob = probs[i / 2]; + int b; + + for (b = 0; b <= 1; ++b) { + const int cc = c + vp9_cost_bit(prob, b); + const vp9_tree_index ii = tree[i + b]; + + if (ii <= 0) + costs[-ii] = cc; + else + cost(costs, tree, probs, ii, cc); + } } -void vp9_stop_encode(vp9_writer *br) { - int i; - - for (i = 0; i < 32; i++) - vp9_write_bit(br, 0); - - // Ensure there's no ambigous collision with any index marker bytes - if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) - br->buffer[br->pos++] = 0; +void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) { + cost(costs, tree, probs, 0, 0); } +void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) { + assert(tree[0] <= 0 && tree[1] > 0); + + costs[-tree[0]] = vp9_cost_bit(probs[0], 0); + cost(costs, tree, probs, 2, 0); +} diff --git a/media/libvpx/vp9/encoder/vp9_cost.h b/media/libvpx/vp9/encoder/vp9_cost.h new file mode 100644 index 000000000000..6d2b9400d7e0 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_cost.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_COST_H_ +#define VP9_ENCODER_VP9_COST_H_ + +#include "vp9/common/vp9_prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const unsigned int vp9_prob_cost[256]; + +#define vp9_cost_zero(prob) (vp9_prob_cost[prob]) + +#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob)) + +#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 
vp9_complement(prob) \ + : (prob)) + +static INLINE unsigned int cost_branch256(const unsigned int ct[2], + vp9_prob p) { + return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); +} + +static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs, + int bits, int len) { + int cost = 0; + vp9_tree_index i = 0; + + do { + const int bit = (bits >> --len) & 1; + cost += vp9_cost_bit(probs[i >> 1], bit); + i = tree[i + bit]; + } while (len); + + return cost; +} + +void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree); +void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_COST_H_ diff --git a/media/libvpx/vp9/encoder/vp9_dct.c b/media/libvpx/vp9/encoder/vp9_dct.c index 0f4a6bb63282..59222f0a959a 100644 --- a/media/libvpx/vp9/encoder/vp9_dct.c +++ b/media/libvpx/vp9/encoder/vp9_dct.c @@ -18,8 +18,6 @@ #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_dct.h" - static INLINE int fdct_round_shift(int input) { int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); assert(INT16_MIN <= rv && rv <= INT16_MAX); @@ -45,11 +43,22 @@ static void fdct4(const int16_t *input, int16_t *output) { output[3] = fdct_round_shift(temp2); } +void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) + sum += input[r * stride + c]; + + output[0] = sum << 1; + output[1] = 0; +} + void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). 
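/* Aside: a self-contained sketch of the recursive cost() walk in vp9_cost.c
 * above. A vp9-style tree is an array of index pairs: a non-positive entry
 * is a leaf holding the negated token value, a positive entry points at the
 * next pair. The toy tree below has three symbols; cost_bit() here is only a
 * placeholder for the vp9_prob_cost table lookup. */
#include <stdio.h>

typedef signed char tree_index;
typedef unsigned char prob_t;

static int cost_bit(prob_t p, int bit) {
  /* placeholder metric; the real table holds -log2 costs in 1/256-bit units */
  return bit ? 256 - p : p;
}

static void cost(int *costs, const tree_index *tree, const prob_t *probs,
                 int i, int c) {
  int b;
  for (b = 0; b <= 1; ++b) {
    const int cc = c + cost_bit(probs[i / 2], b);
    const tree_index ii = tree[i + b];
    if (ii <= 0)
      costs[-ii] = cc;                   /* leaf: record the path cost */
    else
      cost(costs, tree, probs, ii, cc);  /* internal node: recurse */
  }
}

int main(void) {
  /* symbol 0 under the first branch; symbols 1 and 2 under the second */
  static const tree_index tree[4] = {0, 2, -1, -2};
  static const prob_t probs[2] = {128, 200};
  int costs[3];
  cost(costs, tree, probs, 0, 0);
  printf("costs: %d %d %d\n", costs[0], costs[1], costs[2]);
  return 0;
}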
int pass; @@ -157,32 +166,36 @@ static const transform_2d FHT_4[] = { { fadst4, fadst4 } // ADST_ADST = 3 }; -void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[4 * 4]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[4], temp_out[4]; - const transform_2d ht = FHT_4[tx_type]; +void vp9_fht4x4_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct4x4_c(input, output, stride); + } else { + int16_t out[4 * 4]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[4], temp_out[4]; + const transform_2d ht = FHT_4[tx_type]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j * stride + i] * 16; - if (i == 0 && temp_in[0]) - temp_in[0] += 1; - ht.cols(temp_in, temp_out); - for (j = 0; j < 4; ++j) - outptr[j * 4 + i] = temp_out[j]; - } + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j * stride + i] * 16; + if (i == 0 && temp_in[0]) + temp_in[0] += 1; + ht.cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) + outptr[j * 4 + i] = temp_out[j]; + } - // Rows - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j + i * 4]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 4; ++j) - output[j + i * 4] = (temp_out[j] + 1) >> 2; + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j + i * 4]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j + i * 4] = (temp_out[j] + 1) >> 2; + } } } @@ -238,6 +251,17 @@ static void fdct8(const int16_t *input, int16_t *output) { output[7] = fdct_round_shift(t3); } +void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) + sum += input[r * stride + c]; + + output[0] = sum; + output[1] = 0; +} + void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { int i, j; int16_t intermediate[64]; @@ -309,11 +333,22 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { } } +void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 1; + output[1] = 0; +} + void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). 
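/* Aside: the vp9_fdct*_1_c functions added in this hunk are DC-only
 * transforms: they produce just coefficient 0 as the scaled pixel sum
 * (<<1 for 4x4, unscaled for 8x8, >>1 for 16x16, >>3 for 32x32). A generic
 * sketch with a signed per-size shift: */
#include <stdio.h>

static int dc_only(const short *input, int stride, int n, int shift) {
  int r, c, sum = 0;
  for (r = 0; r < n; ++r)
    for (c = 0; c < n; ++c)
      sum += input[r * stride + c];
  return shift >= 0 ? sum << shift : sum >> -shift;
}

int main(void) {
  short blk[8 * 8];
  int i;
  for (i = 0; i < 64; ++i)
    blk[i] = 1;
  printf("4x4 DC: %d\n", dc_only(blk, 8, 4, 1));   /* 16 << 1 = 32 */
  printf("8x8 DC: %d\n", dc_only(blk, 8, 8, 0));   /* 64 */
  return 0;
}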
int pass; @@ -443,20 +478,20 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { step3[7] = step1[7] + step2[4]; // step 4 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; step2[1] = fdct_round_shift(temp1); step2[2] = fdct_round_shift(temp2); - temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; step2[5] = fdct_round_shift(temp1); step2[6] = fdct_round_shift(temp2); // step 5 step1[0] = step3[0] + step2[1]; step1[1] = step3[0] - step2[1]; - step1[2] = step3[3] - step2[2]; - step1[3] = step3[3] + step2[2]; - step1[4] = step3[4] + step2[5]; - step1[5] = step3[4] - step2[5]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; step1[6] = step3[7] - step2[6]; step1[7] = step3[7] + step2[6]; // step 6 @@ -565,30 +600,34 @@ static const transform_2d FHT_8[] = { { fadst8, fadst8 } // ADST_ADST = 3 }; -void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[64]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[8], temp_out[8]; - const transform_2d ht = FHT_8[tx_type]; +void vp9_fht8x8_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct8x8_c(input, output, stride); + } else { + int16_t out[64]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[8], temp_out[8]; + const transform_2d ht = FHT_8[tx_type]; - // Columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 8; ++j) - outptr[j * 8 + i] = temp_out[j]; - } + // Columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + outptr[j * 8 + i] = temp_out[j]; + } - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j + i * 8]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 8; ++j) - output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j + i * 8]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 8; ++j) + output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; + } } } @@ -749,10 +788,10 @@ static void fdct16(const int16_t in[16], int16_t out[16]) { // step 4 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; step2[1] = fdct_round_shift(temp1); step2[2] = fdct_round_shift(temp2); - temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; step2[5] = fdct_round_shift(temp1); step2[6] = fdct_round_shift(temp2); @@ -760,10 +799,10 @@ static void fdct16(const int16_t in[16], int16_t out[16]) { // step 5 step1[0] = step3[0] + step2[1]; step1[1] = step3[0] - step2[1]; - step1[2] = step3[3] - step2[2]; - step1[3] = step3[3] + step2[2]; - step1[4] = step3[4] + step2[5]; - step1[5] = step3[4] - step2[5]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; step1[6] = 
step3[7] - step2[6]; step1[7] = step3[7] + step2[6]; @@ -958,31 +997,34 @@ static const transform_2d FHT_16[] = { { fadst16, fadst16 } // ADST_ADST = 3 }; -void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, - int stride, int tx_type) { - int16_t out[256]; - int16_t *outptr = &out[0]; - int i, j; - int16_t temp_in[16], temp_out[16]; - const transform_2d ht = FHT_16[tx_type]; +void vp9_fht16x16_c(const int16_t *input, int16_t *output, + int stride, int tx_type) { + if (tx_type == DCT_DCT) { + vp9_fdct16x16_c(input, output, stride); + } else { + int16_t out[256]; + int16_t *outptr = &out[0]; + int i, j; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = FHT_16[tx_type]; - // Columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; -// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = input[j * stride + i] * 4; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + } - // Rows - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j + i * 16]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i * 16] = temp_out[j]; + // Rows + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j + i * 16]; + ht.rows(temp_in, temp_out); + for (j = 0; j < 16; ++j) + output[j + i * 16] = temp_out[j]; + } } } @@ -997,7 +1039,7 @@ static INLINE int half_round_shift(int input) { return rv; } -static void dct32_1d(const int *input, int *output, int round) { +static void fdct32(const int *input, int *output, int round) { int step[32]; // Stage 1 step[0] = input[0] + input[(32 - 1)]; @@ -1320,6 +1362,17 @@ static void dct32_1d(const int *input, int *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } +void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { + int r, c; + int16_t sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) + sum += input[r * stride + c]; + + output[0] = sum >> 3; + output[1] = 0; +} + void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1329,7 +1382,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; - dct32_1d(temp_in, temp_out, 0); + fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } @@ -1339,13 +1392,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - dct32_1d(temp_in, temp_out, 0); + fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; } } -// Note that although we use dct_32_round in dct32_1d computation flow, +// Note that although we use dct_32_round in dct32 computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
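/* Aside: the "+ 1 + (x < 0)" rounding used throughout these transforms (and
 * in the 16-bit-precision RD path below) rounds symmetrically about zero,
 * unlike a plain arithmetic shift, which rounds toward minus infinity. A
 * tiny demonstration: */
#include <stdio.h>

static int round_shift2(int x) {
  return (x + 1 + (x < 0)) >> 2;
}

int main(void) {
  int v;
  for (v = -6; v <= 6; v += 3)
    printf("%3d -> rounded %2d, plain >>2 %2d\n", v, round_shift2(v), v >> 2);
  return 0;
}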
void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { @@ -1357,7 +1410,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; - dct32_1d(temp_in, temp_out, 0); + fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) // TODO(cd): see quality impact of only doing // output[j * 32 + i] = (temp_out[j] + 1) >> 2; @@ -1370,32 +1423,8 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - dct32_1d(temp_in, temp_out, 1); + fdct32(temp_in, temp_out, 1); for (j = 0; j < 32; ++j) out[j + i * 32] = temp_out[j]; } } - -void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct4x4(input, output, stride); - else - vp9_short_fht4x4(input, output, stride, tx_type); -} - -void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct8x8(input, output, stride); - else - vp9_short_fht8x8(input, output, stride, tx_type); -} - -void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride) { - if (tx_type == DCT_DCT) - vp9_fdct16x16(input, output, stride); - else - vp9_short_fht16x16(input, output, stride, tx_type); -} diff --git a/media/libvpx/vp9/encoder/vp9_dct.h b/media/libvpx/vp9/encoder/vp9_dct.h deleted file mode 100644 index aaf976d93c5a..000000000000 --- a/media/libvpx/vp9/encoder/vp9_dct.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_DCT_H_ -#define VP9_ENCODER_VP9_DCT_H_ - -void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output, - int stride); - -#endif  // VP9_ENCODER_VP9_DCT_H_ diff --git a/media/libvpx/vp9/encoder/vp9_denoiser.c b/media/libvpx/vp9/encoder/vp9_denoiser.c new file mode 100644 index 000000000000..c4cf5eeb6ded --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_denoiser.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <limits.h> +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_denoiser.h" + +/* The VP9 denoiser is a work-in-progress.
It currently is only designed to work + * with speed 6, though it (inexplicably) seems to also work with speed 5 (one + * would need to modify the source code in vp9_pickmode.c and vp9_encoder.c to + * make the calls to the vp9_denoiser_* functions when in speed 5). + * + * The implementation is very similar to that of the VP8 denoiser. While + * choosing the motion vectors / reference frames, the denoiser is run, and if + * it did not modify the signal too much, the denoised block is copied to the + * signal. + */ + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv); +#endif + +static const int widths[] = {4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64}; +static const int heights[] = {4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64}; + +static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + return 3 + (increase_denoising ? 1 : 0); +} + +static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 4; +} + +static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) { + (void)bs; + (void)increase_denoising; + return 25 * 25; +} + +static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) { + return widths[bs] * heights[bs] * (increase_denoising ? 60 : 40); +} + +static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising, + int mv_row, int mv_col) { + if (mv_row * mv_row + mv_col * mv_col > + noise_motion_thresh(bs, increase_denoising)) { + return 0; + } else { + return widths[bs] * heights[bs] * 20; + } +} + +static int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) { + return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2); +} + +static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) { + return widths[bs] * heights[bs] * (increase_denoising ? 3 : 2); +} + +static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride, + const uint8_t *mc_avg, + int mc_avg_stride, + uint8_t *avg, int avg_stride, + int increase_denoising, + BLOCK_SIZE bs, + int motion_magnitude) { + int r, c; + const uint8_t *sig_start = sig; + const uint8_t *mc_avg_start = mc_avg; + uint8_t *avg_start = avg; + int diff, adj, absdiff, delta; + int adj_val[] = {3, 4, 6}; + int total_adj = 0; + int shift_inc = 1; + + /* If motion_magnitude is small, make the denoiser more aggressive by + * increasing the adjustment for each level. Add another increment for + * blocks that are labeled for increased denoising. */ + if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) { + if (increase_denoising) { + shift_inc = 2; + } + adj_val[0] += shift_inc; + adj_val[1] += shift_inc; + adj_val[2] += shift_inc; + } + + // First attempt to apply a strong temporal denoising filter. + for (r = 0; r < heights[bs]; ++r) { + for (c = 0; c < widths[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + absdiff = abs(diff); + + if (absdiff <= absdiff_thresh(bs, increase_denoising)) { + avg[c] = mc_avg[c]; + total_adj += diff; + } else { + switch (absdiff) { + case 4: case 5: case 6: case 7: + adj = adj_val[0]; + break; + case 8: case 9: case 10: case 11: + case 12: case 13: case 14: case 15: + adj = adj_val[1]; + break; + default: + adj = adj_val[2]; + } + if (diff > 0) { + avg[c] = MIN(UINT8_MAX, sig[c] + adj); + total_adj += adj; + } else { + avg[c] = MAX(0, sig[c] - adj); + total_adj -= adj; + } + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // If the strong filter did not modify the signal too much, we're all set.
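All of these thresholds scale with block area through the widths[]/heights[] tables, which makes the magnitudes easy to sanity-check: for a 16x16 block at default strength, the SSE acceptance threshold is 10240, the strong and weak total-adjustment budgets are both 512, and the SSE-difference allowance is 5120. A small sketch recomputing them from the same formulas (standalone, since the helpers above are file-local); the acceptance test that follows compares total_adj against exactly the strong budget:

#include <stdio.h>

/* Re-derives the 16x16 denoiser thresholds from the width * height
 * formulas of sse_thresh(), total_adj_*_thresh() and sse_diff_thresh(). */
int main(void) {
  const int w = 16, h = 16;
  const int increase_denoising = 0;
  printf("sse_thresh              = %d\n", w * h * (increase_denoising ? 60 : 40));
  printf("total_adj strong / weak = %d\n", w * h * (increase_denoising ? 3 : 2));
  printf("sse_diff_thresh         = %d\n", w * h * 20);
  return 0;  /* prints 10240, 512, 5120 */
}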
+ if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + + // Otherwise, we try to dampen the filter if the delta is not too high. + delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) + >> 8) + 1; + + if (delta >= delta_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + + mc_avg = mc_avg_start; + avg = avg_start; + sig = sig_start; + for (r = 0; r < heights[bs]; ++r) { + for (c = 0; c < widths[bs]; ++c) { + diff = mc_avg[c] - sig[c]; + adj = abs(diff); + if (adj > delta) { + adj = delta; + } + if (diff > 0) { + // Diff positive means we made positive adjustment above + // (in first try/attempt), so now make negative adjustment to bring + // denoised signal down. + avg[c] = MAX(0, avg[c] - adj); + total_adj -= adj; + } else { + // Diff negative means we made negative adjustment above + // (in first try/attempt), so now make positive adjustment to bring + // denoised signal up. + avg[c] = MIN(UINT8_MAX, avg[c] + adj); + total_adj += adj; + } + } + sig += sig_stride; + avg += avg_stride; + mc_avg += mc_avg_stride; + } + + // We can use the filter if it has been sufficiently dampened + if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) { + return FILTER_BLOCK; + } + return COPY_BLOCK; +} + +static uint8_t *block_start(uint8_t *framebuf, int stride, + int mi_row, int mi_col) { + return framebuf + (stride * mi_row * 8) + (mi_col * 8); +} + +static void copy_block(uint8_t *dest, int dest_stride, + const uint8_t *src, int src_stride, BLOCK_SIZE bs) { + int r; + for (r = 0; r < heights[bs]; ++r) { + vpx_memcpy(dest, src, widths[bs]); + dest += dest_stride; + src += src_stride; + } +} + +static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser, + MACROBLOCK *mb, + BLOCK_SIZE bs, + int increase_denoising, + int mi_row, + int mi_col, + PICK_MODE_CONTEXT *ctx, + int *motion_magnitude + ) { + int mv_col, mv_row; + int sse_diff = ctx->zeromv_sse - ctx->newmv_sse; + MV_REFERENCE_FRAME frame; + MACROBLOCKD *filter_mbd = &mb->e_mbd; + MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi; + + MB_MODE_INFO saved_mbmi; + int i, j; + struct buf_2d saved_dst[MAX_MB_PLANE]; + struct buf_2d saved_pre[MAX_MB_PLANE][2]; // 2 pre buffers + + // We will restore these after motion compensation. + saved_mbmi = *mbmi; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (j = 0; j < 2; ++j) { + saved_pre[i][j] = filter_mbd->plane[i].pre[j]; + } + saved_dst[i] = filter_mbd->plane[i].dst; + } + + mv_col = ctx->best_sse_mv.as_mv.col; + mv_row = ctx->best_sse_mv.as_mv.row; + + *motion_magnitude = mv_row * mv_row + mv_col * mv_col; + + frame = ctx->best_reference_frame; + + // If the best reference frame uses inter-prediction and there is enough of a + // difference in sum-squared-error, use it. + if (frame != INTRA_FRAME && + sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) { + mbmi->ref_frame[0] = ctx->best_reference_frame; + mbmi->mode = ctx->best_sse_inter_mode; + mbmi->mv[0] = ctx->best_sse_mv; + } else { + // Otherwise, use the zero reference frame. + frame = ctx->best_zeromv_reference_frame; + + mbmi->ref_frame[0] = ctx->best_zeromv_reference_frame; + mbmi->mode = ZEROMV; + mbmi->mv[0].as_int = 0; + + ctx->best_sse_inter_mode = ZEROMV; + ctx->best_sse_mv.as_int = 0; + ctx->newmv_sse = ctx->zeromv_sse; + } + + // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser + // struct. 
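block_start is the only address arithmetic in the file: a mode-info unit is 8 pixels square, so (mi_row, mi_col) maps to byte offset stride * mi_row * 8 + mi_col * 8 inside a plane. A quick illustration with made-up geometry; the loop that follows points all three planes' pre and dst buffers at offsets computed exactly this way:

#include <stdio.h>

/* Mode-info units are 8x8 pixels, so block_start() resolves to
 * framebuf + stride * mi_row * 8 + mi_col * 8. */
int main(void) {
  const int stride = 640;            /* hypothetical y_stride */
  const int mi_row = 3, mi_col = 5;  /* hypothetical block position */
  printf("offset = %d\n", stride * mi_row * 8 + mi_col * 8);  /* 15400 */
  return 0;
}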
+ for (j = 0; j < 2; ++j) { + filter_mbd->plane[0].pre[j].buf = + block_start(denoiser->running_avg_y[frame].y_buffer, + denoiser->running_avg_y[frame].y_stride, + mi_row, mi_col); + filter_mbd->plane[0].pre[j].stride = + denoiser->running_avg_y[frame].y_stride; + filter_mbd->plane[1].pre[j].buf = + block_start(denoiser->running_avg_y[frame].u_buffer, + denoiser->running_avg_y[frame].uv_stride, + mi_row, mi_col); + filter_mbd->plane[1].pre[j].stride = + denoiser->running_avg_y[frame].uv_stride; + filter_mbd->plane[2].pre[j].buf = + block_start(denoiser->running_avg_y[frame].v_buffer, + denoiser->running_avg_y[frame].uv_stride, + mi_row, mi_col); + filter_mbd->plane[2].pre[j].stride = + denoiser->running_avg_y[frame].uv_stride; + } + filter_mbd->plane[0].dst.buf = + block_start(denoiser->mc_running_avg_y.y_buffer, + denoiser->mc_running_avg_y.y_stride, + mi_row, mi_col); + filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; + filter_mbd->plane[1].dst.buf = + block_start(denoiser->mc_running_avg_y.u_buffer, + denoiser->mc_running_avg_y.uv_stride, + mi_row, mi_col); + filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; + filter_mbd->plane[2].dst.buf = + block_start(denoiser->mc_running_avg_y.v_buffer, + denoiser->mc_running_avg_y.uv_stride, + mi_row, mi_col); + filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; + + vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs); + + // Restore everything to its original state + *mbmi = saved_mbmi; + for (i = 0; i < MAX_MB_PLANE; ++i) { + for (j = 0; j < 2; ++j) { + filter_mbd->plane[i].pre[j] = saved_pre[i][j]; + } + filter_mbd->plane[i].dst = saved_dst[i]; + } + + mv_row = ctx->best_sse_mv.as_mv.row; + mv_col = ctx->best_sse_mv.as_mv.col; + + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + if (mv_row * mv_row + mv_col * mv_col > + 8 * noise_motion_thresh(bs, increase_denoising)) { + return COPY_BLOCK; + } + return FILTER_BLOCK; +} + +void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, + int mi_row, int mi_col, BLOCK_SIZE bs, + PICK_MODE_CONTEXT *ctx) { + int motion_magnitude = 0; + VP9_DENOISER_DECISION decision = FILTER_BLOCK; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, + mi_row, mi_col); + struct buf_2d src = mb->plane[0].src; + + decision = perform_motion_compensation(denoiser, mb, bs, + denoiser->increase_denoising, + mi_row, mi_col, ctx, + &motion_magnitude); + + if (decision == FILTER_BLOCK) { + decision = denoiser_filter(src.buf, src.stride, + mc_avg_start, mc_avg.y_stride, + avg_start, avg.y_stride, + 0, bs, motion_magnitude); + } + + if (decision == FILTER_BLOCK) { + copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs); + } else { // COPY_BLOCK + copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs); + } +} + +static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) { + int r; + const uint8_t *srcbuf = src.y_buffer; + uint8_t *destbuf = dest.y_buffer; + assert(dest.y_width == src.y_width); + assert(dest.y_height == src.y_height); + + for (r = 0; r < dest.y_height; ++r) { + vpx_memcpy(destbuf, srcbuf, dest.y_width); + destbuf += dest.y_stride; + srcbuf += src.y_stride; + } +} + +void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, + YV12_BUFFER_CONFIG src, 
+ FRAME_TYPE frame_type, + int refresh_alt_ref_frame, + int refresh_golden_frame, + int refresh_last_frame) { + if (frame_type == KEY_FRAME) { + int i; + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < MAX_REF_FRAMES; ++i) { + copy_frame(denoiser->running_avg_y[i], src); + } + } else { /* For non key frames */ + if (refresh_alt_ref_frame) { + copy_frame(denoiser->running_avg_y[ALTREF_FRAME], + denoiser->running_avg_y[INTRA_FRAME]); + } + if (refresh_golden_frame) { + copy_frame(denoiser->running_avg_y[GOLDEN_FRAME], + denoiser->running_avg_y[INTRA_FRAME]); + } + if (refresh_last_frame) { + copy_frame(denoiser->running_avg_y[LAST_FRAME], + denoiser->running_avg_y[INTRA_FRAME]); + } + } +} + +void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) { + ctx->zeromv_sse = UINT_MAX; + ctx->newmv_sse = UINT_MAX; +} + +void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse, + PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx) { + // TODO(tkopp): Use both MVs if possible + if (mbmi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) { + ctx->zeromv_sse = sse; + ctx->best_zeromv_reference_frame = mbmi->ref_frame[0]; + } + + if (mode == NEWMV) { + ctx->newmv_sse = sse; + ctx->best_sse_inter_mode = mode; + ctx->best_sse_mv = mbmi->mv[0]; + ctx->best_reference_frame = mbmi->ref_frame[0]; + } +} + +int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, + int ssx, int ssy, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border) { + int i, fail; + assert(denoiser != NULL); + + for (i = 0; i < MAX_REF_FRAMES; ++i) { + fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, + ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } + + fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, + ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + denoiser->increase_denoising = 0; + + return 0; +} + +void vp9_denoiser_free(VP9_DENOISER *denoiser) { + int i; + if (denoiser == NULL) { + return; + } + for (i = 0; i < MAX_REF_FRAMES; ++i) { + if (&denoiser->running_avg_y[i] != NULL) { + vp9_free_frame_buffer(&denoiser->running_avg_y[i]); + } + } + if (&denoiser->mc_running_avg_y != NULL) { + vp9_free_frame_buffer(&denoiser->mc_running_avg_y); + } +} + +#ifdef OUTPUT_YUV_DENOISED +static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { + int r, c; + uint8_t *u = yuv->u_buffer; + uint8_t *v = yuv->v_buffer; + + // The '/2's are there because we have a 440 buffer, but we want to output + // 420. + for (r = 0; r < yuv->uv_height / 2; ++r) { + for (c = 0; c < yuv->uv_width / 2; ++c) { + u[c] = UINT8_MAX / 2; + v[c] = UINT8_MAX / 2; + } + u += yuv->uv_stride + yuv->uv_width / 2; + v += yuv->uv_stride + yuv->uv_width / 2; + } +} +#endif diff --git a/media/libvpx/vp9/encoder/vp9_denoiser.h b/media/libvpx/vp9/encoder/vp9_denoiser.h new file mode 100644 index 000000000000..a913add86e4a --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_denoiser.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_DENOISER_H_ +#define VP9_ENCODER_DENOISER_H_ + +#include "vp9/encoder/vp9_block.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MOTION_MAGNITUDE_THRESHOLD (8*3) + +typedef enum vp9_denoiser_decision { + COPY_BLOCK, + FILTER_BLOCK +} VP9_DENOISER_DECISION; + +typedef struct vp9_denoiser { + YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; + YV12_BUFFER_CONFIG mc_running_avg_y; + int increase_denoising; +} VP9_DENOISER; + +void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, + YV12_BUFFER_CONFIG src, + FRAME_TYPE frame_type, + int refresh_alt_ref_frame, + int refresh_golden_frame, + int refresh_last_frame); + +void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb, + int mi_row, int mi_col, BLOCK_SIZE bs, + PICK_MODE_CONTEXT *ctx); + +void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); + +void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, + unsigned int sse, PREDICTION_MODE mode, + PICK_MODE_CONTEXT *ctx); + +int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, + int ssx, int ssy, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border); + +void vp9_denoiser_free(VP9_DENOISER *denoiser); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_DENOISER_H_ diff --git a/media/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/vp9/encoder/vp9_encodeframe.c index 8e71d6b57482..72ced052918b 100644 --- a/media/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/media/libvpx/vp9/encoder/vp9_encodeframe.c @@ -20,8 +20,6 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" -#include "vp9/common/vp9_extend.h" -#include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_pred_common.h" @@ -29,72 +27,39 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_systemdependent.h" #include "vp9/common/vp9_tile_common.h" + +#include "vp9/encoder/vp9_aq_complexity.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_encodeframe.h" -#include "vp9/encoder/vp9_encodeintra.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_pickmode.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" -#include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_tokenize.h" -#include "vp9/encoder/vp9_vaq.h" - -#define DBG_PRNT_SEGMAP 0 - - -// #define ENC_DEBUG -#ifdef ENC_DEBUG -int enc_debug = 0; -#endif - -static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) { - switch (subsize) { - case BLOCK_64X64: - case BLOCK_64X32: - case BLOCK_32X64: - case BLOCK_32X32: - return &x->sb_index; - case BLOCK_32X16: - case BLOCK_16X32: - case BLOCK_16X16: - return &x->mb_index; - case BLOCK_16X8: - case BLOCK_8X16: - case BLOCK_8X8: - return &x->b_index; - case BLOCK_8X4: - case BLOCK_4X8: - case BLOCK_4X4: - return &x->ab_index; - default: - assert(0); - return NULL; - } -} +#define GF_ZEROMV_ZBIN_BOOST 0 
+#define LF_ZEROMV_ZBIN_BOOST 0 +#define MV_ZBIN_BOOST 0 +#define SPLIT_MV_ZBIN_BOOST 0 +#define INTRA_ZBIN_BOOST 0 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE bsize); + int mi_row, int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); -static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); +// Motion vector component magnitude threshold for defining fast motion. +#define FAST_MOTION_MV_THRESH 24 -/* activity_avg must be positive, or flat regions could get a zero weight - * (infinite lambda), which confounds analysis. - * This also avoids the need for divide by zero checks in - * vp9_activity_masking(). - */ -#define ACTIVITY_AVG_MIN (64) - -/* Motion vector component magnitude threshold for defining fast motion. */ -#define FAST_MOTION_MV_THRESH (24) - -/* This is used as a reference when computing the source variance for the - * purposes of activity masking. - * Eventually this should be replaced by custom no-reference routines, - * which will be faster. - */ +// This is used as a reference when computing the source variance for the +// purposes of activity masking. +// Eventually this should be replaced by custom no-reference routines, +// which will be faster. static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, @@ -106,299 +71,493 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128 }; -static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, MACROBLOCK *x, +static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs) { - unsigned int var, sse; - var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, - x->plane[0].src.stride, - VP9_VAR_OFFS, 0, &sse); - return (var + (1 << (num_pels_log2_lookup[bs] - 1))) >> - num_pels_log2_lookup[bs]; -} - -// Original activity measure from Tim T's code. -static unsigned int tt_activity_measure(MACROBLOCK *x) { - unsigned int act; unsigned int sse; - /* TODO: This could also be done over smaller areas (8x8), but that would - * require extensive changes elsewhere, as lambda is assumed to be fixed - * over an entire MB in most of the code. - * Another option is to compute four 8x8 variances, and pick a single - * lambda using a non-linear combination (e.g., the smallest, or second - * smallest, etc.). - */ - act = vp9_variance16x16(x->plane[0].src.buf, x->plane[0].src.stride, - VP9_VAR_OFFS, 0, &sse); - act <<= 4; - - /* If the region is flat, lower the activity some more. */ - if (act < 8 << 12) - act = act < 5 << 12 ? act : 5 << 12; - - return act; + const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + VP9_VAR_OFFS, 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } -// Stub for alternative experimental activity measures. 
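The rewritten get_sby_perpixel_variance above normalizes the block variance to a per-pixel figure with ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]), a rounded right shift by the log2 pixel count. A self-contained check for a 64x64 block (so a shift by 12; the macro body below matches libvpx's rounded-right-shift definition, and the input value is hypothetical):

#include <stdio.h>

/* Rounded right shift: add half of the divisor before shifting. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  const unsigned int var = 1050624;  /* hypothetical 64x64 block variance */
  /* (1050624 + 2048) >> 12 == 257, i.e. 256.5 rounded up. */
  printf("per-pixel variance = %u\n", ROUND_POWER_OF_TWO(var, 12));
  return 0;
}

The removed activity-measure code that follows is superseded by this simpler normalized path.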
-static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) { - return vp9_encode_intra(x, use_dc_pred); +static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + int mi_row, int mi_col, + BLOCK_SIZE bs) { + const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); + const uint8_t* last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride + + mi_col * MI_SIZE]; + unsigned int sse; + const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + last_y, last->y_stride, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); } -// Measure the activity of the current macroblock -// What we measure here is TBD so abstracted to this function -#define ALT_ACT_MEASURE 1 -static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) { - unsigned int mb_activity; +static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, + int mi_row, + int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, + BLOCK_64X64); + if (var < 8) + return BLOCK_64X64; + else if (var < 128) + return BLOCK_32X32; + else if (var < 2048) + return BLOCK_16X16; + else + return BLOCK_8X8; +} - if (ALT_ACT_MEASURE) { - int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); +static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi, + int mi_row, + int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, + BLOCK_64X64); + if (var < 4) + return BLOCK_64X64; + else if (var < 10) + return BLOCK_32X32; + else + return BLOCK_16X16; +} - // Or use and alternative. - mb_activity = alt_activity_measure(x, use_dc_pred); +// Lighter version of set_offsets that only sets the mode info +// pointers. +static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm, + MACROBLOCKD *const xd, + int mi_row, + int mi_col) { + const int idx_str = xd->mi_stride * mi_row + mi_col; + xd->mi = cm->mi_grid_visible + idx_str; + xd->mi[0] = cm->mi + idx_str; +} + +static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + MACROBLOCK *const x = &cpi->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const struct segmentation *const seg = &cm->seg; + + set_skip_context(xd, mi_row, mi_col); + + set_modeinfo_offsets(cm, xd, mi_row, mi_col); + + mbmi = &xd->mi[0]->mbmi; + + // Set up destination pointers. + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND); + x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND); + x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND; + x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND; + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_rows, cm->mi_cols); + + // Set up source buffers. + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + + // R/D setup. + x->rddiv = cpi->rd.RDDIV; + x->rdmult = cpi->rd.RDMULT; + + // Setup segment ID. 
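get_rd_var_based_fixed_partition condenses the per-pixel source/last-frame difference variance into a fixed partition size with three cut-offs (8, 128, 2048), and the nonrd variant uses the tighter 4/10 split. A toy restatement of the rd mapping with the thresholds copied from above (the printed names stand in for the BLOCK_SIZE enum); set_offsets then resolves the segment ID announced just above:

#include <stdio.h>

/* Mirrors get_rd_var_based_fixed_partition(): flatter 64x64 regions keep
 * larger fixed partitions, busier ones drop toward 8x8. */
static const char *rd_fixed_partition(unsigned int var) {
  if (var < 8) return "BLOCK_64X64";
  else if (var < 128) return "BLOCK_32X32";
  else if (var < 2048) return "BLOCK_16X16";
  else return "BLOCK_8X8";
}

int main(void) {
  printf("%s %s %s %s\n", rd_fixed_partition(3), rd_fixed_partition(50),
         rd_fixed_partition(500), rd_fixed_partition(5000));
  return 0;  /* BLOCK_64X64 BLOCK_32X32 BLOCK_16X16 BLOCK_8X8 */
}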
+ if (seg->enabled) { + if (cpi->oxcf.aq_mode != VARIANCE_AQ) { + const uint8_t *const map = seg->update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; + mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } + vp9_init_plane_quantizers(cpi, x); + + x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id]; } else { - // Original activity measure from Tim T's code. - mb_activity = tt_activity_measure(x); + mbmi->segment_id = 0; + x->encode_breakout = cpi->encode_breakout; + } +} + +static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const int block_width = num_8x8_blocks_wide_lookup[bsize]; + const int block_height = num_8x8_blocks_high_lookup[bsize]; + int i, j; + for (j = 0; j < block_height; ++j) + for (i = 0; i < block_width; ++i) { + if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols) + xd->mi[j * xd->mi_stride + i] = xd->mi[0]; + } +} + +static void set_block_size(VP9_COMP * const cpi, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col); + xd->mi[0]->mbmi.sb_type = bsize; + duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize); + } +} + +typedef struct { + int64_t sum_square_error; + int64_t sum_error; + int count; + int variance; +} var; + +typedef struct { + var none; + var horz[2]; + var vert[2]; +} partition_variance; + +typedef struct { + partition_variance part_variances; + var split[4]; +} v8x8; + +typedef struct { + partition_variance part_variances; + v8x8 split[4]; +} v16x16; + +typedef struct { + partition_variance part_variances; + v16x16 split[4]; +} v32x32; + +typedef struct { + partition_variance part_variances; + v32x32 split[4]; +} v64x64; + +typedef struct { + partition_variance *part_variances; + var *split[4]; +} variance_node; + +typedef enum { + V16X16, + V32X32, + V64X64, +} TREE_LEVEL; + +static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { + int i; + node->part_variances = NULL; + vpx_memset(node->split, 0, sizeof(node->split)); + switch (bsize) { + case BLOCK_64X64: { + v64x64 *vt = (v64x64 *) data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_32X32: { + v32x32 *vt = (v32x32 *) data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_16X16: { + v16x16 *vt = (v16x16 *) data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_8X8: { + v8x8 *vt = (v8x8 *) data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i]; + break; + } + default: { + assert(0); + break; + } + } +} + +// Set variance values given sum square error, sum error, count. 
+static void fill_variance(int64_t s2, int64_t s, int c, var *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->count = c; + if (c > 0) + v->variance = (int)(256 * + (v->sum_square_error - v->sum_error * v->sum_error / + v->count) / v->count); + else + v->variance = 0; +} + +void sum_2_variances(const var *a, const var *b, var *r) { + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->count + b->count, r); +} + +static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static int set_vt_partitioning(VP9_COMP *cpi, + void *data, + BLOCK_SIZE bsize, + int mi_row, + int mi_col) { + VP9_COMMON * const cm = &cpi->common; + variance_node vt; + const int block_width = num_8x8_blocks_wide_lookup[bsize]; + const int block_height = num_8x8_blocks_high_lookup[bsize]; + // TODO(debargha): Choose this more intelligently. + const int64_t threshold_multiplier = 25; + int64_t threshold = threshold_multiplier * cpi->common.base_qindex; + assert(block_height == block_width); + + tree_to_node(data, bsize, &vt); + + // Split none is available only if we have more than half a block size + // in width and height inside the visible image. + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, mi_row, mi_col, bsize); + return 1; } - if (mb_activity < ACTIVITY_AVG_MIN) - mb_activity = ACTIVITY_AVG_MIN; + // Vertical split is available on all but the bottom border. + if (mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT); + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize); + return 1; + } - return mb_activity; + // Horizontal split is available on all but the right border. + if (mi_col + block_width / 2 < cm->mi_cols && + vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold) { + BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ); + set_block_size(cpi, mi_row, mi_col, subsize); + set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + return 0; } -// Calculate an "average" mb activity value for the frame -#define ACT_MEDIAN 0 -static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { -#if ACT_MEDIAN - // Find median: Simple n^2 algorithm for experimentation - { - unsigned int median; - unsigned int i, j; - unsigned int *sortlist; - unsigned int tmp; +// TODO(debargha): Fix this function and make it work as expected. 
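fill_variance stores running sums rather than finished variances precisely so that sum_2_variances can build parent nodes by addition; the variance member itself is kept pre-scaled by 256, as 256 * (s2 - s*s/c) / c. A standalone check with four hypothetical samples whose population variance is 5:

#include <stdio.h>

int main(void) {
  /* samples 1, 3, 5, 7: mean 4, population variance 5 */
  const long long s2 = 1 + 9 + 25 + 49;  /* sum of squares: 84 */
  const long long s = 1 + 3 + 5 + 7;     /* sum: 16 */
  const int c = 4;
  /* same integer expression as fill_variance(): 256 * (84 - 64) / 4 */
  printf("scaled variance = %d\n", (int)(256 * (s2 - s * s / c) / c));
  return 0;  /* prints 1280, i.e. 256 * 5 */
}

choose_partitioning below leans on these helpers to build the whole 64x64 tree before any split decision is made.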
+static void choose_partitioning(VP9_COMP *cpi, + const TileInfo *const tile, + int mi_row, int mi_col) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK *x = &cpi->mb; + MACROBLOCKD *xd = &cpi->mb.e_mbd; - // Create a list to sort to - CHECK_MEM_ERROR(&cpi->common, sortlist, vpx_calloc(sizeof(unsigned int), - cpi->common.MBs)); + int i, j, k; + v64x64 vt; + uint8_t *s; + const uint8_t *d; + int sp; + int dp; + int pixels_wide = 64, pixels_high = 64; + int_mv nearest_mv, near_mv; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - // Copy map to sort list - vpx_memcpy(sortlist, cpi->mb_activity_map, - sizeof(unsigned int) * cpi->common.MBs); + vp9_zero(vt); + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - // Ripple each value down to its correct position - for (i = 1; i < cpi->common.MBs; i ++) { - for (j = i; j > 0; j --) { - if (sortlist[j] < sortlist[j - 1]) { - // Swap values - tmp = sortlist[j - 1]; - sortlist[j - 1] = sortlist[j]; - sortlist[j] = tmp; - } else { - break; + if (xd->mb_to_right_edge < 0) + pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) + pixels_high += (xd->mb_to_bottom_edge >> 3); + + s = x->plane[0].src.buf; + sp = x->plane[0].src.stride; + + if (cm->frame_type != KEY_FRAME) { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf); + + xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; + xd->mi[0]->mbmi.sb_type = BLOCK_64X64; + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, + xd->mi[0]->mbmi.ref_mvs[LAST_FRAME], + &nearest_mv, &near_mv); + + xd->mi[0]->mbmi.mv[0] = nearest_mv; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64); + + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + } else { + d = VP9_VAR_OFFS; + dp = 0; + } + + // Fill in the entire tree of 8x8 variances for splits. + for (i = 0; i < 4; i++) { + const int x32_idx = ((i & 1) << 5); + const int y32_idx = ((i >> 1) << 5); + for (j = 0; j < 4; j++) { + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); + v16x16 *vst = &vt.split[i].split[j]; + for (k = 0; k < 4; k++) { + int x_idx = x16_idx + ((k & 1) << 3); + int y_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x_idx < pixels_wide && y_idx < pixels_high) + vp9_get8x8var(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); + fill_variance(sse, sum, 64, &vst->split[k].part_variances.none); + } + } + } + // Fill the rest of the variance tree by summing split partition values. + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); + } + fill_variance_tree(&vt.split[i], BLOCK_32X32); + } + fill_variance_tree(&vt, BLOCK_64X64); + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold, or we + // hit 8x8. 
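Because the tree stores variances scaled by 256, the comparison in set_vt_partitioning against threshold_multiplier * base_qindex (25 * base_qindex above) amounts to a cut-off of roughly qindex / 10 in true variance, loosening as the quantizer coarsens. A worked number continuing the example earlier, with a hypothetical quantizer index; the calls that follow walk the tree exactly this way, level by level:

#include <stdio.h>

/* set_vt_partitioning() keeps a node whole when its 256-scaled variance
 * is below 25 * base_qindex. */
int main(void) {
  const int base_qindex = 60;                      /* hypothetical quantizer index */
  const long long threshold = 25LL * base_qindex;  /* 1500 */
  const int scaled_variance = 1280;                /* true variance 5, times 256 */
  printf("%s\n", scaled_variance < threshold ? "keep whole" : "split");
  return 0;  /* prints "keep whole" */
}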
+ if (!set_vt_partitioning(cpi, &vt, BLOCK_64X64, + mi_row, mi_col)) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 2); + const int y32_idx = ((i >> 1) << 2); + if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32, + (mi_row + y32_idx), (mi_col + x32_idx))) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 1); + const int y16_idx = ((j >> 1) << 1); + // NOTE: This is a temporary hack to disable 8x8 partitions, + // since it works really badly - possibly due to a bug +#define DISABLE_8X8_VAR_BASED_PARTITION +#ifdef DISABLE_8X8_VAR_BASED_PARTITION + if (mi_row + y32_idx + y16_idx + 1 < cm->mi_rows && + mi_row + x32_idx + x16_idx + 1 < cm->mi_cols) { + set_block_size(cpi, + (mi_row + y32_idx + y16_idx), + (mi_col + x32_idx + x16_idx), + BLOCK_16X16); + } else { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1); + const int y8_idx = (k >> 1); + set_block_size(cpi, + (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } +#else + if (!set_vt_partitioning(cpi, &vt.split[i].split[j], tile, + BLOCK_16X16, + (mi_row + y32_idx + y16_idx), + (mi_col + x32_idx + x16_idx), 2)) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1); + const int y8_idx = (k >> 1); + set_block_size(cpi, + (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } +#endif } } } - - // Even number MBs so estimate median as mean of two either side. - median = (1 + sortlist[cpi->common.MBs >> 1] + - sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; - - cpi->activity_avg = median; - - vpx_free(sortlist); } -#else - // Simple mean for now - cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); -#endif // ACT_MEDIAN - - if (cpi->activity_avg < ACTIVITY_AVG_MIN) - cpi->activity_avg = ACTIVITY_AVG_MIN; - - // Experimental code: return fixed value normalized for several clips - if (ALT_ACT_MEASURE) - cpi->activity_avg = 100000; -} - -#define USE_ACT_INDEX 0 -#define OUTPUT_NORM_ACT_STATS 0 - -#if USE_ACT_INDEX -// Calculate an activity index for each mb -static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { - VP9_COMMON *const cm = &cpi->common; - int mb_row, mb_col; - - int64_t act; - int64_t a; - int64_t b; - -#if OUTPUT_NORM_ACT_STATS - FILE *f = fopen("norm_act.stt", "a"); - fprintf(f, "\n%12d\n", cpi->activity_avg); -#endif - - // Reset pointers to start of activity map - x->mb_activity_ptr = cpi->mb_activity_map; - - // Calculate normalized mb activity number. - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - // Read activity from the map - act = *(x->mb_activity_ptr); - - // Calculate a normalized activity number - a = act + 4 * cpi->activity_avg; - b = 4 * act + cpi->activity_avg; - - if (b >= a) - *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; - else - *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); - -#if OUTPUT_NORM_ACT_STATS - fprintf(f, " %6d", *(x->mb_activity_ptr)); -#endif - // Increment activity map pointers - x->mb_activity_ptr++; - } - -#if OUTPUT_NORM_ACT_STATS - fprintf(f, "\n"); -#endif - } - -#if OUTPUT_NORM_ACT_STATS - fclose(f); -#endif -} -#endif // USE_ACT_INDEX - -// Loop through all MBs.
Note activity of each, average activity and -// calculate a normalized activity for each -static void build_activity_map(VP9_COMP *cpi) { - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON * const cm = &cpi->common; - -#if ALT_ACT_MEASURE - YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm); - int recon_yoffset; - int recon_y_stride = new_yv12->y_stride; -#endif - - int mb_row, mb_col; - unsigned int mb_activity; - int64_t activity_sum = 0; - - x->mb_activity_ptr = cpi->mb_activity_map; - - // for each macroblock row in image - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { -#if ALT_ACT_MEASURE - // reset above block coeffs - xd->up_available = (mb_row != 0); - recon_yoffset = (mb_row * recon_y_stride * 16); -#endif - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { -#if ALT_ACT_MEASURE - xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; - xd->left_available = (mb_col != 0); - recon_yoffset += 16; -#endif - - // measure activity - mb_activity = mb_activity_measure(x, mb_row, mb_col); - - // Keep frame sum - activity_sum += mb_activity; - - // Store MB level activity details. - *x->mb_activity_ptr = mb_activity; - - // Increment activity map pointer - x->mb_activity_ptr++; - - // adjust to the next column of source macroblocks - x->plane[0].src.buf += 16; - } - - // adjust to the next row of mbs - x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - } - - // Calculate an "average" MB activity - calc_av_activity(cpi, activity_sum); - -#if USE_ACT_INDEX - // Calculate an activity index number of each mb - calc_activity_index(cpi, x); -#endif -} - -// Macroblock activity masking -void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { -#if USE_ACT_INDEX - x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2); - x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); - x->errorperbit += (x->errorperbit == 0); -#else - int64_t a; - int64_t b; - int64_t act = *(x->mb_activity_ptr); - - // Apply the masking to the RD multiplier. 
- a = act + (2 * cpi->activity_avg); - b = (2 * act) + cpi->activity_avg; - - x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a); - x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); - x->errorperbit += (x->errorperbit == 0); -#endif - - // Activity based Zbin adjustment - adjust_act_zbin(cpi, x); } static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE bsize, int output_enabled) { + int mi_row, int mi_col, BLOCK_SIZE bsize, + int output_enabled) { int i, x_idx, y; VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - MODE_INFO *mi_addr = xd->mi_8x8[0]; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + MODE_INFO *mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; - int mb_mode_index = ctx->best_mode_index; - const int mis = cm->mode_info_stride; + const int mis = cm->mi_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; int max_plane; - assert(mi->mbmi.mode < MB_MODE_COUNT); - assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES); - assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES); assert(mi->mbmi.sb_type == bsize); *mi_addr = *mi; + // If segmentation in use + if (seg->enabled && output_enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = seg->update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; + mi_addr->mbmi.segment_id = + vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, + mi_row, mi_col, bsize, 1); + } + } + max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1; for (i = 0; i < max_plane; ++i) { p[i].coeff = ctx->coeff_pbuf[i][1]; - pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - pd[i].eobs = ctx->eobs_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; } for (i = max_plane; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][2]; - pd[i].qcoeff = ctx->qcoeff_pbuf[i][2]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][2]; pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; - pd[i].eobs = ctx->eobs_pbuf[i][2]; + p[i].eobs = ctx->eobs_pbuf[i][2]; } // Restore the coding context of the MB to that that was in place @@ -406,12 +565,12 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, for (y = 0; y < mi_height; y++) for (x_idx = 0; x_idx < mi_width; x_idx++) if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx - && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) - xd->mi_8x8[x_idx + y * mis] = mi_addr; + && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) { + xd->mi[x_idx + y * mis] = mi_addr; + } - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_mb_init_quantizer(cpi, x); - } + if (cpi->oxcf.aq_mode) + vp9_init_plane_quantizers(cpi, x); // FIXME(rbultje) I'm pretty sure this should go to the end of this block // (i.e. 
after the output_enabled) @@ -435,183 +594,122 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { for (i = 0; i < TX_MODES; i++) - cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; + rd_opt->tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS + if (frame_is_intra_only(cm)) { static const int kf_mode_index[] = { - THR_DC /*DC_PRED*/, - THR_V_PRED /*V_PRED*/, - THR_H_PRED /*H_PRED*/, - THR_D45_PRED /*D45_PRED*/, + THR_DC /*DC_PRED*/, + THR_V_PRED /*V_PRED*/, + THR_H_PRED /*H_PRED*/, + THR_D45_PRED /*D45_PRED*/, THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/, THR_D153_PRED /*D153_PRED*/, THR_D207_PRED /*D207_PRED*/, - THR_D63_PRED /*D63_PRED*/, - THR_TM /*TM_PRED*/, + THR_D63_PRED /*D63_PRED*/, + THR_TM /*TM_PRED*/, }; - cpi->mode_chosen_counts[kf_mode_index[mi->mbmi.mode]]++; -#endif + ++cpi->mode_chosen_counts[kf_mode_index[mbmi->mode]]; } else { // Note how often each mode chosen as best - cpi->mode_chosen_counts[mb_mode_index]++; - if (is_inter_block(mbmi) - && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { - int_mv best_mv[2]; - const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; - best_mv[0].as_int = ctx->best_ref_mv.as_int; - best_mv[1].as_int = ctx->second_best_ref_mv.as_int; - if (mbmi->mode == NEWMV) { - best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int; - if (rf2 > 0) - best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int; + ++cpi->mode_chosen_counts[ctx->best_mode_index]; + } +#endif + if (!frame_is_intra_only(cm)) { + if (is_inter_block(mbmi)) { + vp9_update_mv_count(cm, xd); + + if (cm->interp_filter == SWITCHABLE) { + const int ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; } - mbmi->best_mv[0].as_int = best_mv[0].as_int; - mbmi->best_mv[1].as_int = best_mv[1].as_int; - vp9_update_mv_count(cpi, x, best_mv); } - if (cm->mcomp_filter_type == SWITCHABLE && is_inter_mode(mbmi->mode)) { - const int ctx = vp9_get_pred_context_switchable_interp(xd); - ++cm->counts.switchable_interp[ctx][mbmi->interp_filter]; - } + rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff; + rd_opt->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff; + rd_opt->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff; - cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; - cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; - cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; - - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) - cpi->rd_filter_diff[i] += ctx->best_filter_diff[i]; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + rd_opt->filter_diff[i] += ctx->best_filter_diff[i]; } } void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { - uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer }; + const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride }; int i; + // Set current frame pointer. 
+ x->e_mbd.cur_buf = src; + for (i = 0; i < MAX_MB_PLANE; i++) setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col, NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); } -static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, BLOCK_SIZE bsize) { - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; +static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate, + int64_t *dist, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi; - const int dst_fb_idx = cm->new_fb_idx; - const int idx_str = xd->mode_info_stride * mi_row + mi_col; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mb_row = mi_row >> 1; - const int mb_col = mi_col >> 1; - const int idx_map = mb_row * cm->mb_cols + mb_col; - const struct segmentation *const seg = &cm->seg; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + INTERP_FILTER filter_ref; - set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col); + if (xd->up_available) + filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + else if (xd->left_available) + filter_ref = xd->mi[-1]->mbmi.interp_filter; + else + filter_ref = EIGHTTAP; - // Activity map pointer - x->mb_activity_ptr = &cpi->mb_activity_map[idx_map]; - x->active_ptr = cpi->active_map + idx_map; + mbmi->sb_type = bsize; + mbmi->mode = ZEROMV; + mbmi->tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[tx_mode]); + mbmi->skip = 1; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + mbmi->interp_filter = filter_ref; - xd->mi_8x8 = cm->mi_grid_visible + idx_str; - xd->prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; + xd->mi[0]->bmi[0].as_mv[0].as_int = 0; + x->skip = 1; - // Special case: if prev_mi is NULL, the previous mode info context - // cannot be used. - xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL; - - xd->mi_8x8[0] = cm->mi + idx_str; - - mbmi = &xd->mi_8x8[0]->mbmi; - - // Set up destination pointers - setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); - - // Set up limit values for MV components - // mv beyond the range do not produce new/different prediction block - x->mv_row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND); - x->mv_col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND); - x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND; - x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND; - - // Set up distance of MB to edge of frame in 1/8th pel units - assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); - set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, - cm->mi_rows, cm->mi_cols); - - /* set up source buffers */ - vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); - - /* R/D setup */ - x->rddiv = cpi->RDDIV; - x->rdmult = cpi->RDMULT; - - /* segment ID */ - if (seg->enabled) { - if (!cpi->oxcf.aq_mode == VARIANCE_AQ) { - uint8_t *map = seg->update_map ? 
cpi->segmentation_map - : cm->last_frame_seg_map; - mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); - } - vp9_mb_init_quantizer(cpi, x); - - if (seg->enabled && cpi->seg0_cnt > 0 - && !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME) - && vp9_segfeature_active(seg, 1, SEG_LVL_REF_FRAME)) { - cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt; - } else { - const int y = mb_row & ~3; - const int x = mb_col & ~3; - const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); - const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1; - const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1; - - cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) - << 16) / cm->MBs; - } - - x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id]; - } else { - mbmi->segment_id = 0; - x->encode_breakout = cpi->oxcf.encode_breakout; - } + *rate = 0; + *dist = 0; } -static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, - int *totalrate, int64_t *totaldist, - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd) { +static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, + int *totalrate, int64_t *totaldist, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd, int block) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - int i; - int orig_rdmult = x->rdmult; + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + int i, orig_rdmult; double rdmult_ratio; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); rdmult_ratio = 1.0; // avoid uninitialized warnings // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; + // TODO(JBB): Most other places in the code instead of calling the function + // and then checking if its not the first 8x8 we put the check in the + // calling function. Do that here. if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 // there is nothing to be done. - if (x->ab_index != 0) { + if (block != 0) { *totalrate = 0; *totaldist = 0; return; @@ -619,41 +717,57 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, } set_offsets(cpi, tile, mi_row, mi_col, bsize); - xd->mi_8x8[0]->mbmi.sb_type = bsize; + mbmi = &xd->mi[0]->mbmi; + mbmi->sb_type = bsize; for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][0]; - pd[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][0]; pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; - pd[i].eobs = ctx->eobs_pbuf[i][0]; + p[i].eobs = ctx->eobs_pbuf[i][0]; } ctx->is_coded = 0; + ctx->skippable = 0; x->skip_recode = 0; // Set to zero to make sure we do not use the previous encoded frame stats - xd->mi_8x8[0]->mbmi.skip_coeff = 0; + mbmi->skip = 0; - x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); + x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - int energy; - if (bsize <= BLOCK_16X16) { - energy = x->mb_energy; + // Save rdmult before it might be changed, so it can be restored later. + orig_rdmult = x->rdmult; + + if (aq_mode == VARIANCE_AQ) { + const int energy = bsize <= BLOCK_16X16 ? 
x->mb_energy + : vp9_block_energy(cpi, x, bsize); + if (cm->frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + mbmi->segment_id = vp9_vaq_segment_id(energy); } else { - energy = vp9_block_energy(cpi, x, bsize); + const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; + mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); } - xd->mi_8x8[0]->mbmi.segment_id = vp9_vaq_segment_id(energy); rdmult_ratio = vp9_vaq_rdmult_ratio(energy); - vp9_mb_init_quantizer(cpi, x); - } - - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) - vp9_activity_masking(cpi, x); - - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - x->rdmult = round(x->rdmult * rdmult_ratio); + vp9_init_plane_quantizers(cpi, x); + vp9_clear_system_state(); + x->rdmult = (int)round(x->rdmult * rdmult_ratio); + } else if (aq_mode == COMPLEXITY_AQ) { + const int mi_offset = mi_row * cm->mi_cols + mi_col; + unsigned char complexity = cpi->complexity_map[mi_offset]; + const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) || + (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2)); + if (!is_edge && (complexity > 128)) + x->rdmult += ((x->rdmult * (complexity - 128)) / 256); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; + // If segment 1, use rdmult for that segment. + if (vp9_get_segment_id(cm, map, bsize, mi_row, mi_col)) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); } // Find best coding mode & reconstruct the MB so it is available @@ -662,76 +776,66 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx, best_rd); } else { - if (bsize >= BLOCK_8X8) - vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, - totalrate, totaldist, bsize, ctx, best_rd); - else + if (bsize >= BLOCK_8X8) { + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, totalrate, totaldist, bsize, + ctx, best_rd); + else + vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col, + totalrate, totaldist, bsize, ctx, best_rd); + } else { vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate, totaldist, bsize, ctx, best_rd); + } } - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - x->rdmult = orig_rdmult; - if (*totalrate != INT_MAX) { - vp9_clear_system_state(); // __asm emms; - *totalrate = round(*totalrate * rdmult_ratio); - } + x->rdmult = orig_rdmult; + + if (aq_mode == VARIANCE_AQ && *totalrate != INT_MAX) { + vp9_clear_system_state(); + *totalrate = (int)round(*totalrate * rdmult_ratio); } } -static void update_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi = xd->mi_8x8[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; +static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MODE_INFO *const mi = xd->mi[0]; + const MB_MODE_INFO *const mbmi = &mi->mbmi; if (!frame_is_intra_only(cm)) { const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + FRAME_COUNTS *const counts = &cm->counts; + const int inter_block = is_inter_block(mbmi); - if (!seg_ref_active) - cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)] - 
[is_inter_block(mbmi)]++; + counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; - // If the segment reference feature is enabled we have only a single - // reference frame allowed for the segment so exclude it from - // the reference frame counts used to work out probabilities. - if (is_inter_block(mbmi) && !seg_ref_active) { - if (cm->comp_pred_mode == HYBRID_PREDICTION) - cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)] - [has_second_ref(mbmi)]++; + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; - if (has_second_ref(mbmi)) { - cpi->comp_ref_count[vp9_get_pred_context_comp_ref_p(cm, xd)] - [mbmi->ref_frame[0] == GOLDEN_FRAME]++; - } else { - cpi->single_ref_count[vp9_get_pred_context_single_ref_p1(xd)][0] - [mbmi->ref_frame[0] != LAST_FRAME]++; - if (mbmi->ref_frame[0] != LAST_FRAME) - cpi->single_ref_count[vp9_get_pred_context_single_ref_p2(xd)][1] - [mbmi->ref_frame[0] != GOLDEN_FRAME]++; + if (cm->reference_mode == REFERENCE_MODE_SELECT) + counts->comp_inter[vp9_get_reference_mode_context(cm, xd)] + [has_second_ref(mbmi)]++; + + if (has_second_ref(mbmi)) { + counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] + [ref0 == GOLDEN_FRAME]++; + } else { + counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] + [ref0 != LAST_FRAME]++; + if (ref0 != LAST_FRAME) + counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1] + [ref0 != GOLDEN_FRAME]++; + } } } } } -static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) { - switch (bsize) { - case BLOCK_64X64: - return &x->sb64_partitioning; - case BLOCK_32X32: - return &x->sb_partitioning[x->sb_index]; - case BLOCK_16X16: - return &x->mb_partitioning[x->sb_index][x->mb_index]; - case BLOCK_8X8: - return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index]; - default: - assert(0); - return NULL; - } -} - static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -746,22 +850,23 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { vpx_memcpy( - cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), a + num_4x4_blocks_wide * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( - cpi->left_context[p] + xd->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), l + num_4x4_blocks_high * p, (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(cpi->above_seg_context + mi_col, sa, - sizeof(*cpi->above_seg_context) * mi_width); - vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(cpi->left_seg_context[0]) * mi_height); + vpx_memcpy(xd->above_seg_context + mi_col, sa, + sizeof(*xd->above_seg_context) * mi_width); + vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(xd->left_seg_context[0]) * mi_height); } + static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -779,46 +884,32 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, for (p = 0; p < 
MAX_MB_PLANE; ++p) { vpx_memcpy( a + num_4x4_blocks_wide * p, - cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> xd->plane[p].subsampling_x); vpx_memcpy( l + num_4x4_blocks_high * p, - cpi->left_context[p] + xd->left_context[p] + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> xd->plane[p].subsampling_y); } - vpx_memcpy(sa, cpi->above_seg_context + mi_col, - sizeof(*cpi->above_seg_context) * mi_width); - vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK), - sizeof(cpi->left_seg_context[0]) * mi_height); + vpx_memcpy(sa, xd->above_seg_context + mi_col, + sizeof(*xd->above_seg_context) * mi_width); + vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK), + sizeof(xd->left_seg_context[0]) * mi_height); } static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize, int sub_index) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) - return; - - if (sub_index != -1) - *get_sb_index(x, bsize) = sub_index; - - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. - if (x->ab_index > 0) - return; - } + int output_enabled, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { set_offsets(cpi, tile, mi_row, mi_col, bsize); - update_state(cpi, get_block_context(x, bsize), bsize, output_enabled); - encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); + update_state(cpi, ctx, mi_row, mi_col, bsize, output_enabled); + encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); if (output_enabled) { - update_stats(cpi); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -827,67 +918,75 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize) { + int output_enabled, BLOCK_SIZE bsize, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - BLOCK_SIZE c1 = BLOCK_8X8; - const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; - int pl = 0; + MACROBLOCKD *const xd = &x->e_mbd; + + const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; + int ctx; PARTITION_TYPE partition; - BLOCK_SIZE subsize; - int i; + BLOCK_SIZE subsize = bsize; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - c1 = BLOCK_4X4; if (bsize >= BLOCK_8X8) { - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, bsize); - c1 = *(get_sb_partitioning(x, bsize)); + ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + subsize = get_subsize(bsize, pc_tree->partitioning); + } else { + ctx = 0; + subsize = BLOCK_4X4; } - partition = partition_lookup[bsl][c1]; + + partition = partition_lookup[bsl][subsize]; + if (output_enabled && bsize != BLOCK_4X4) + cm->counts.partition[ctx][partition]++; switch (partition) { case PARTITION_NONE: - if (output_enabled && bsize >= BLOCK_8X8) - cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->none); break; case PARTITION_VERT: - if 
(output_enabled) - cpi->partition_count[pl][PARTITION_VERT]++; - encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { + encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize, + &pc_tree->vertical[1]); + } break; case PARTITION_HORZ: - if (output_enabled) - cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { + encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize, + &pc_tree->horizontal[1]); + } break; case PARTITION_SPLIT: - subsize = get_subsize(bsize, PARTITION_SPLIT); - - if (output_enabled) - cpi->partition_count[pl][PARTITION_SPLIT]++; - - for (i = 0; i < 4; i++) { - const int x_idx = i & 1, y_idx = i >> 1; - - *get_sb_index(x, subsize) = i; - encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, - output_enabled, subsize); + if (bsize == BLOCK_8X8) { + encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->leaf_split[0]); + } else { + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->split[0]); + encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize, + pc_tree->split[1]); + encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize, + pc_tree->split[2]); + encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, + subsize, pc_tree->split[3]); } break; default: - assert(0); + assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) - update_partition_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, c1, bsize); + update_partition_context(xd, mi_row, mi_col, subsize, bsize); } // Check to see if the given partition size is allowed for a specified number @@ -896,10 +995,10 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, int cols_left, int *bh, int *bw) { - if ((rows_left <= 0) || (cols_left <= 0)) { + if (rows_left <= 0 || cols_left <= 0) { return MIN(bsize, BLOCK_8X8); } else { - for (; bsize > 0; --bsize) { + for (; bsize > 0; bsize -= 3) { *bh = num_8x8_blocks_high_lookup[bsize]; *bw = num_8x8_blocks_wide_lookup[bsize]; if ((*bh <= rows_left) && (*bw <= cols_left)) { @@ -910,20 +1009,36 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, return bsize; } +static void set_partial_b64x64_partition(MODE_INFO *mi, int mis, + int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining, + BLOCK_SIZE bsize, MODE_INFO **mi_8x8) { + int bh = bh_in; + int r, c; + for (r = 0; r < MI_BLOCK_SIZE; r += bh) { + int bw = bw_in; + for (c = 0; c < MI_BLOCK_SIZE; c += bw) { + const int index = r * mis + c; + mi_8x8[index] = mi + index; + mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize, + row8x8_remaining - r, col8x8_remaining - c, &bh, &bw); + } + } +} + // This function attempts to set all mode info entries in a given SB64 // to the same block partition size.
// However, at the bottom and right borders of the image the requested size // may not be allowed in which case this code attempts to choose the largest // allowable partition. -static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, int mi_row, int mi_col) { +static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile, + MODE_INFO **mi_8x8, int mi_row, int mi_col, + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; - BLOCK_SIZE bsize = cpi->sf.always_this_block_size; - const int mis = cm->mode_info_stride; - int row8x8_remaining = tile->mi_row_end - mi_row; - int col8x8_remaining = tile->mi_col_end - mi_col; + const int mis = cm->mi_stride; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; int block_row, block_col; - MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col; + MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; int bh = num_8x8_blocks_high_lookup[bsize]; int bw = num_8x8_blocks_wide_lookup[bsize]; @@ -941,34 +1056,23 @@ static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } else { // Else this is a partial SB64. - for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { - for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { - int index = block_row * mis + block_col; - // Find a partition size that fits - bsize = find_partition_size(cpi->sf.always_this_block_size, - (row8x8_remaining - block_row), - (col8x8_remaining - block_col), &bh, &bw); - mi_8x8[index] = mi_upper_left + index; - mi_8x8[index]->mbmi.sb_type = bsize; - } - } + set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining, + col8x8_remaining, bsize, mi_8x8); } } -static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, - MODE_INFO **prev_mi_8x8) { - VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; +static void copy_partitioning(VP9_COMMON *cm, MODE_INFO **mi_8x8, + MODE_INFO **prev_mi_8x8) { + const int mis = cm->mi_stride; int block_row, block_col; for (block_row = 0; block_row < 8; ++block_row) { for (block_col = 0; block_col < 8; ++block_col) { - MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col]; - BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; - ptrdiff_t offset; + MODE_INFO *const prev_mi = prev_mi_8x8[block_row * mis + block_col]; + const BLOCK_SIZE sb_type = prev_mi ? 
prev_mi->mbmi.sb_type : 0; if (prev_mi) { - offset = prev_mi - cm->prev_mi; + const ptrdiff_t offset = prev_mi - cm->prev_mi; mi_8x8[block_row * mis + block_col] = cm->mi + offset; mi_8x8[block_row * mis + block_col]->mbmi.sb_type = sb_type; } @@ -976,18 +1080,201 @@ static void copy_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8, } } -static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) { +static void constrain_copy_partitioning(VP9_COMP *const cpi, + const TileInfo *const tile, + MODE_INFO **mi_8x8, + MODE_INFO **prev_mi_8x8, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; + const int mis = cm->mi_stride; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + int block_row, block_col; + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // If the SB64 is all "in image". + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) { + for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) { + const int index = block_row * mis + block_col; + MODE_INFO *prev_mi = prev_mi_8x8[index]; + const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0; + // Use previous partition if block size is not larger than bsize. + if (prev_mi && sb_type <= bsize) { + int block_row2, block_col2; + for (block_row2 = 0; block_row2 < bh; ++block_row2) { + for (block_col2 = 0; block_col2 < bw; ++block_col2) { + const int index2 = (block_row + block_row2) * mis + + block_col + block_col2; + prev_mi = prev_mi_8x8[index2]; + if (prev_mi) { + const ptrdiff_t offset = prev_mi - cm->prev_mi; + mi_8x8[index2] = cm->mi + offset; + mi_8x8[index2]->mbmi.sb_type = prev_mi->mbmi.sb_type; + } + } + } + } else { + // Otherwise, use fixed partition of size bsize. + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = bsize; + } + } + } + } else { + // Else this is a partial SB64, copy previous partition.
+ copy_partitioning(cm, mi_8x8, prev_mi_8x8); + } +} + +const struct { + int row; + int col; +} coord_lookup[16] = { + // 32x32 index = 0 + {0, 0}, {0, 2}, {2, 0}, {2, 2}, + // 32x32 index = 1 + {0, 4}, {0, 6}, {2, 4}, {2, 6}, + // 32x32 index = 2 + {4, 0}, {4, 2}, {6, 0}, {6, 2}, + // 32x32 index = 3 + {4, 4}, {4, 6}, {6, 4}, {6, 6}, +}; + +static void set_source_var_based_partition(VP9_COMP *cpi, + const TileInfo *const tile, + MODE_INFO **mi_8x8, + int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + const int mis = cm->mi_stride; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col; + + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + + assert((row8x8_remaining > 0) && (col8x8_remaining > 0)); + + // In-image SB64 + if ((col8x8_remaining >= MI_BLOCK_SIZE) && + (row8x8_remaining >= MI_BLOCK_SIZE)) { + int i, j; + int index; + diff d32[4]; + const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); + int is_larger_better = 0; + int use32x32 = 0; + unsigned int thr = cpi->source_var_thresh; + + vpx_memset(d32, 0, 4 * sizeof(diff)); + + for (i = 0; i < 4; i++) { + diff *d16[4]; + + for (j = 0; j < 4; j++) { + int b_mi_row = coord_lookup[i * 4 + j].row; + int b_mi_col = coord_lookup[i * 4 + j].col; + int boffset = b_mi_row / 2 * cm->mb_cols + + b_mi_col / 2; + + d16[j] = cpi->source_diff_var + offset + boffset; + + index = b_mi_row * mis + b_mi_col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = BLOCK_16X16; + + // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition + // size to further improve quality. + } + + is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) && + (d16[2]->var < thr) && (d16[3]->var < thr); + + // Use 32x32 partition + if (is_larger_better) { + use32x32 += 1; + + for (j = 0; j < 4; j++) { + d32[i].sse += d16[j]->sse; + d32[i].sum += d16[j]->sum; + } + + d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10); + + index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col; + mi_8x8[index] = mi_upper_left + index; + mi_8x8[index]->mbmi.sb_type = BLOCK_32X32; + } + } + + if (use32x32 == 4) { + thr <<= 1; + is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) && + (d32[2].var < thr) && (d32[3].var < thr); + + // Use 64x64 partition + if (is_larger_better) { + mi_8x8[0] = mi_upper_left; + mi_8x8[0]->mbmi.sb_type = BLOCK_64X64; + } + } + } else { // partial in-image SB64 + int bh = num_8x8_blocks_high_lookup[BLOCK_16X16]; + int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16]; + set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, + row8x8_remaining, col8x8_remaining, BLOCK_16X16, mi_8x8); + } +} + +static int is_background(const VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col) { + // This assumes the input source frames are of the same dimension. 
+ const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; + const int x = mi_col * MI_SIZE; + const int y = mi_row * MI_SIZE; + const int src_stride = cpi->Source->y_stride; + const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x]; + const int pre_stride = cpi->Last_Source->y_stride; + const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x]; + int this_sad = 0; + int threshold = 0; + + if (row8x8_remaining >= MI_BLOCK_SIZE && + col8x8_remaining >= MI_BLOCK_SIZE) { + this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride); + threshold = (1 << 12); + } else { + int r, c; + for (r = 0; r < row8x8_remaining; r += 2) + for (c = 0; c < col8x8_remaining; c += 2) + this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride, + pre, pre_stride); + threshold = (row8x8_remaining * col8x8_remaining) << 6; + } + + return this_sad < 2 * threshold; +} + +static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO **prev_mi_8x8, + const int motion_thresh) { + const int mis = cm->mi_stride; int block_row, block_col; if (cm->prev_mi) { for (block_row = 0; block_row < 8; ++block_row) { for (block_col = 0; block_col < 8; ++block_col) { - MODE_INFO * prev_mi = prev_mi_8x8[block_row * mis + block_col]; + const MODE_INFO *prev_mi = prev_mi_8x8[block_row * mis + block_col]; if (prev_mi) { - if (abs(prev_mi->mbmi.mv[0].as_mv.row) >= 8 || - abs(prev_mi->mbmi.mv[0].as_mv.col) >= 8) + if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh || + abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh) return 1; } } @@ -996,67 +1283,182 @@ static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) { return 0; } +static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, + int mi_row, int mi_col, int bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct segmentation *const seg = &cm->seg; + + *(xd->mi[0]) = ctx->mic; + + // For in-frame adaptive Q, check for resetting the segment_id and updating + // the cyclic refresh map. + if ((cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) && seg->enabled) { + vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, + mi_row, mi_col, bsize, 1); + vp9_init_plane_quantizers(cpi, x); + } + + if (is_inter_block(mbmi)) { + vp9_update_mv_count(cm, xd); + + if (cm->interp_filter == SWITCHABLE) { + const int pred_ctx = vp9_get_pred_context_switchable_interp(xd); + ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter]; + } + } + + x->skip = ctx->skip; + x->skip_txfm[0] = mbmi->segment_id ?
0 : ctx->skip_txfm[0]; +} + +static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + update_state_rt(cpi, ctx, mi_row, mi_col, bsize); + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) { + vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col, + MAX(BLOCK_8X8, bsize), ctx); + } +#endif + + encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); + update_stats(&cpi->common, &cpi->mb); + + (*tp)->token = EOSB_TOKEN; + (*tp)++; +} + +static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE bsize, + PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; + int ctx; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + return; + + if (bsize >= BLOCK_8X8) { + const int idx_str = xd->mi_stride * mi_row + mi_col; + MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str; + ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + subsize = mi_8x8[0]->mbmi.sb_type; + } else { + ctx = 0; + subsize = BLOCK_4X4; + } + + partition = partition_lookup[bsl][subsize]; + if (output_enabled && bsize != BLOCK_4X4) + cm->counts.partition[ctx][partition]++; + + switch (partition) { + case PARTITION_NONE: + encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->none); + break; + case PARTITION_VERT: + encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->vertical[0]); + if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { + encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, + subsize, &pc_tree->vertical[1]); + } + break; + case PARTITION_HORZ: + encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + &pc_tree->horizontal[0]); + if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { + encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, + subsize, &pc_tree->horizontal[1]); + } + break; + case PARTITION_SPLIT: + subsize = get_subsize(bsize, PARTITION_SPLIT); + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize, + pc_tree->split[0]); + encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, + subsize, pc_tree->split[1]); + encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, + subsize, pc_tree->split[2]); + encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, + subsize, pc_tree->split[3]); + break; + default: + assert(0 && "Invalid partition type."); + break; + } + + if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) + update_partition_context(xd, mi_row, mi_col, subsize, bsize); +} + static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile, MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, - int do_recon) { + int do_recon, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - const int mis = cm->mode_info_stride; - int bsl = b_width_log2(bsize); - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - int ms = num_4x4_blocks_wide / 2; - int mh = num_4x4_blocks_high / 2; -
int bss = (1 << bsl) / 4; + MACROBLOCKD *const xd = &x->e_mbd; + const int mis = cm->mi_stride; + const int bsl = b_width_log2(bsize); + const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2; + const int bss = (1 << bsl) / 4; int i, pl; PARTITION_TYPE partition = PARTITION_NONE; BLOCK_SIZE subsize; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; int last_part_rate = INT_MAX; - int64_t last_part_dist = INT_MAX; - int split_rate = INT_MAX; - int64_t split_dist = INT_MAX; + int64_t last_part_dist = INT64_MAX; + int64_t last_part_rd = INT64_MAX; int none_rate = INT_MAX; - int64_t none_dist = INT_MAX; + int64_t none_dist = INT64_MAX; + int64_t none_rd = INT64_MAX; int chosen_rate = INT_MAX; - int64_t chosen_dist = INT_MAX; + int64_t chosen_dist = INT64_MAX; + int64_t chosen_rd = INT64_MAX; BLOCK_SIZE sub_subsize = BLOCK_4X4; int splits_below = 0; BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type; + int do_partition_search = 1; + PICK_MODE_CONTEXT *ctx = &pc_tree->none; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - partition = partition_lookup[bsl][bs_type]; + assert(num_4x4_blocks_wide_lookup[bsize] == + num_4x4_blocks_high_lookup[bsize]); + partition = partition_lookup[bsl][bs_type]; subsize = get_subsize(bsize, partition); - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. - if (x->ab_index != 0) { - *rate = 0; - *dist = 0; - return; - } - } else { - *(get_sb_partitioning(x, bsize)) = subsize; - } + pc_tree->partitioning = partition; save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (bsize == BLOCK_16X16) { + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) { set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (cpi->sf.adjust_partitioning_from_last_frame) { + if (do_partition_search && + cpi->sf.partition_search_type == SEARCH_PARTITION && + cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) { sub_subsize = get_subsize(subsize, PARTITION_SPLIT); @@ -1073,44 +1475,46 @@ static void rd_use_partition(VP9_COMP *cpi, // If partition is not none try none unless each of the 4 splits are split // even further.. 
if (partition != PARTITION_NONE && !splits_below && - mi_row + (ms >> 1) < cm->mi_rows && - mi_col + (ms >> 1) < cm->mi_cols) { - *(get_sb_partitioning(x, bsize)) = bsize; - pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, - get_block_context(x, bsize), INT64_MAX); + mi_row + (mi_step >> 1) < cm->mi_rows && + mi_col + (mi_step >> 1) < cm->mi_cols) { + pc_tree->partitioning = PARTITION_NONE; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, + ctx, INT64_MAX, 0); - pl = partition_plane_context(cpi->above_seg_context, - cpi->left_seg_context, - mi_row, mi_col, bsize); - none_rate += x->partition_cost[pl][PARTITION_NONE]; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + + if (none_rate < INT_MAX) { + none_rate += cpi->partition_cost[pl][PARTITION_NONE]; + none_rd = RDCOST(x->rdmult, x->rddiv, none_rate, none_dist); + } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); mi_8x8[0]->mbmi.sb_type = bs_type; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = partition; } } switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - bsize, get_block_context(x, bsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, bsize, ctx, INT64_MAX, 0); break; case PARTITION_HORZ: - *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, &pc_tree->horizontal[0], + INT64_MAX, 0); if (last_part_rate != INT_MAX && - bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { + bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { int rt = 0; int64_t dt = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, - get_block_context(x, subsize), INT64_MAX); - if (rt == INT_MAX || dt == INT_MAX) { + PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + update_state(cpi, ctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); + rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &rt, &dt, + subsize, &pc_tree->horizontal[1], INT64_MAX, 1); + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } @@ -1119,21 +1523,22 @@ static void rd_use_partition(VP9_COMP *cpi, } break; case PARTITION_VERT: - *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, &pc_tree->vertical[0], + INT64_MAX, 0); if (last_part_rate != INT_MAX && - bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { + bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { int rt = 0; int64_t dt = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, - get_block_context(x, subsize), INT64_MAX); - if (rt == INT_MAX || dt == INT_MAX) { + PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + 
update_state(cpi, ctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &rt, &dt, + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], + INT64_MAX, 1); + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } last_part_rate += rt; @@ -1141,12 +1546,17 @@ static void rd_use_partition(VP9_COMP *cpi, } break; case PARTITION_SPLIT: - // Split partition. + if (bsize == BLOCK_8X8) { + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, pc_tree->leaf_split[0], + INT64_MAX, 0); + break; + } last_part_rate = 0; last_part_dist = 0; for (i = 0; i < 4; i++) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); + int x_idx = (i & 1) * (mi_step >> 1); + int y_idx = (i >> 1) * (mi_step >> 1); int jj = i >> 1, ii = i & 0x01; int rt; int64_t dt; @@ -1154,14 +1564,12 @@ static void rd_use_partition(VP9_COMP *cpi, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(x, subsize) = i; - rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp, mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt, - i != 3); - if (rt == INT_MAX || dt == INT_MAX) { + i != 3, pc_tree->split[i]); + if (rt == INT_MAX || dt == INT64_MAX) { last_part_rate = INT_MAX; - last_part_dist = INT_MAX; + last_part_dist = INT64_MAX; break; } last_part_rate += rt; @@ -1170,26 +1578,33 @@ static void rd_use_partition(VP9_COMP *cpi, break; default: assert(0); + break; } - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, bsize); - if (last_part_rate < INT_MAX) - last_part_rate += x->partition_cost[pl][partition]; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + if (last_part_rate < INT_MAX) { + last_part_rate += cpi->partition_cost[pl][partition]; + last_part_rd = RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist); + } - if (cpi->sf.adjust_partitioning_from_last_frame + if (do_partition_search + && cpi->sf.adjust_partitioning_from_last_frame + && cpi->sf.partition_search_type == SEARCH_PARTITION && partition != PARTITION_SPLIT && bsize > BLOCK_8X8 - && (mi_row + ms < cm->mi_rows || mi_row + (ms >> 1) == cm->mi_rows) - && (mi_col + ms < cm->mi_cols || mi_col + (ms >> 1) == cm->mi_cols)) { + && (mi_row + mi_step < cm->mi_rows || + mi_row + (mi_step >> 1) == cm->mi_rows) + && (mi_col + mi_step < cm->mi_cols || + mi_col + (mi_step >> 1) == cm->mi_cols)) { BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT); - split_rate = 0; - split_dist = 0; + chosen_rate = 0; + chosen_dist = 0; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + pc_tree->partitioning = PARTITION_SPLIT; // Split partition. 
for (i = 0; i < 4; i++) { - int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2); - int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2); + int x_idx = (i & 1) * (mi_step >> 1); + int y_idx = (i >> 1) * (mi_step >> 1); int rt = 0; int64_t dt = 0; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; @@ -1198,59 +1613,51 @@ static void rd_use_partition(VP9_COMP *cpi, if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - *get_sb_index(x, split_subsize) = i; - *get_sb_partitioning(x, bsize) = split_subsize; - *get_sb_partitioning(x, split_subsize) = split_subsize; - save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - - pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, - split_subsize, get_block_context(x, split_subsize), - INT64_MAX); + pc_tree->split[i]->partitioning = PARTITION_NONE; + rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + split_subsize, &pc_tree->split[i]->none, + INT64_MAX, i); restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (rt == INT_MAX || dt == INT_MAX) { - split_rate = INT_MAX; - split_dist = INT_MAX; + if (rt == INT_MAX || dt == INT64_MAX) { + chosen_rate = INT_MAX; + chosen_dist = INT64_MAX; break; } + chosen_rate += rt; + chosen_dist += dt; + if (i != 3) encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0, - split_subsize); + split_subsize, pc_tree->split[i]); - split_rate += rt; - split_dist += dt; - pl = partition_plane_context(cpi->above_seg_context, - cpi->left_seg_context, - mi_row + y_idx, mi_col + x_idx, bsize); - split_rate += x->partition_cost[pl][PARTITION_NONE]; + pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx, + split_subsize); + chosen_rate += cpi->partition_cost[pl][PARTITION_NONE]; } - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, bsize); - if (split_rate < INT_MAX) { - split_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - chosen_rate = split_rate; - chosen_dist = split_dist; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + if (chosen_rate < INT_MAX) { + chosen_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + chosen_rd = RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist); } } - // If last_part is better set the partitioning to that... - if (RDCOST(x->rdmult, x->rddiv, last_part_rate, last_part_dist) - < RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist)) { + // If last_part is better, set the partitioning to that. + if (last_part_rd < chosen_rd) { mi_8x8[0]->mbmi.sb_type = bsize; if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = partition; chosen_rate = last_part_rate; chosen_dist = last_part_dist; + chosen_rd = last_part_rd; } - // If none was better set the partitioning to that... - if (RDCOST(x->rdmult, x->rddiv, chosen_rate, chosen_dist) - > RDCOST(x->rdmult, x->rddiv, none_rate, none_dist)) { + // If none was better, set the partitioning to that. + if (none_rd < chosen_rd) { if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + pc_tree->partitioning = PARTITION_NONE; chosen_rate = none_rate; chosen_dist = none_dist; } @@ -1260,25 +1667,44 @@ static void rd_use_partition(VP9_COMP *cpi, // We must have chosen a partitioning and encoding or we'll fail later on. // No other opportunities for success.
if ( bsize == BLOCK_64X64) - assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX); + assert(chosen_rate < INT_MAX && chosen_dist < INT64_MAX); - if (do_recon) - encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + if (do_recon) { + int output_enabled = (bsize == BLOCK_64X64); + + // Check the projected output rate for this SB against its target + // and if necessary apply a Q delta using segmentation to get + // closer to the target. + if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, + output_enabled, chosen_rate); + } + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, + chosen_rate, chosen_dist); + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, + pc_tree); + } *rate = chosen_rate; *dist = chosen_dist; } static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = { - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_16X16 }; static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, - BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, - BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, + BLOCK_16X16, BLOCK_32X32, BLOCK_32X32, + BLOCK_32X32, BLOCK_64X64, BLOCK_64X64, + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, + BLOCK_64X64 }; // Look at all the mode_info entries for blocks that are part of this @@ -1288,10 +1714,10 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { // // The min and max are assumed to have been initialized prior to calling this // function so repeat calls can accumulate a min and max of more than one sb64. -static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, - BLOCK_SIZE * min_block_size, - BLOCK_SIZE * max_block_size ) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size, + int bs_hist[BLOCK_SIZES]) { int sb_width_in_blocks = MI_BLOCK_SIZE; int sb_height_in_blocks = MI_BLOCK_SIZE; int i, j; @@ -1302,167 +1728,233 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, for (j = 0; j < sb_width_in_blocks; ++j) { MODE_INFO * mi = mi_8x8[index+j]; BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0; + bs_hist[sb_type]++; *min_block_size = MIN(*min_block_size, sb_type); *max_block_size = MAX(*max_block_size, sb_type); } - index += xd->mode_info_stride; + index += xd->mi_stride; } } +// Next square block size less than or equal to the current block size. +static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = { + BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, + BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, + BLOCK_64X64 +}; + // Look at neighboring blocks and set a min and max partition size based on // what they chose.
static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, - int row, int col, + int mi_row, int mi_col, BLOCK_SIZE *min_block_size, BLOCK_SIZE *max_block_size) { - VP9_COMMON * const cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MODE_INFO ** mi_8x8 = xd->mi_8x8; - MODE_INFO ** prev_mi_8x8 = xd->prev_mi_8x8; - - const int left_in_image = xd->left_available && mi_8x8[-1]; - const int above_in_image = xd->up_available && - mi_8x8[-xd->mode_info_stride]; - MODE_INFO ** above_sb64_mi_8x8; - MODE_INFO ** left_sb64_mi_8x8; - - int row8x8_remaining = tile->mi_row_end - row; - int col8x8_remaining = tile->mi_col_end - col; + MODE_INFO **mi = xd->mi; + const int left_in_image = xd->left_available && mi[-1]; + const int above_in_image = xd->up_available && mi[-xd->mi_stride]; + const int row8x8_remaining = tile->mi_row_end - mi_row; + const int col8x8_remaining = tile->mi_col_end - mi_col; int bh, bw; + BLOCK_SIZE min_size = BLOCK_4X4; + BLOCK_SIZE max_size = BLOCK_64X64; + int i = 0; + int bs_hist[BLOCK_SIZES] = {0}; // Trap case where we do not have a prediction. - if (!left_in_image && !above_in_image && - ((cm->frame_type == KEY_FRAME) || !cm->prev_mi)) { - *min_block_size = BLOCK_4X4; - *max_block_size = BLOCK_64X64; - } else { + if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) { // Default "min to max" and "max to min" - *min_block_size = BLOCK_64X64; - *max_block_size = BLOCK_4X4; + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; // NOTE: each call to get_sb_partition_size_range() uses the previous // passed in values for min and max as a starting point. - // // Find the min and max partition used in previous frame at this location - if (cm->prev_mi && (cm->frame_type != KEY_FRAME)) { - get_sb_partition_size_range(cpi, prev_mi_8x8, - min_block_size, max_block_size); + if (cm->frame_type != KEY_FRAME) { + MODE_INFO **const prev_mi = + &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; + get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist); } - // Find the min and max partition sizes used in the left SB64 if (left_in_image) { - left_sb64_mi_8x8 = &mi_8x8[-MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, left_sb64_mi_8x8, - min_block_size, max_block_size); + MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE]; + get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size, + bs_hist); } - // Find the min and max partition sizes used in the above SB64. 
if (above_in_image) { - above_sb64_mi_8x8 = &mi_8x8[-xd->mode_info_stride * MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, above_sb64_mi_8x8, - min_block_size, max_block_size); + MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE]; + get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size, + bs_hist); + } + + // adjust observed min and max + if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } else if (cpi->sf.auto_min_max_partition_size == + CONSTRAIN_NEIGHBORING_MIN_MAX) { + // adjust the search range based on the histogram of the observed + // partition sizes from left, above the previous co-located blocks + int sum = 0; + int first_moment = 0; + int second_moment = 0; + int var_unnormalized = 0; + + for (i = 0; i < BLOCK_SIZES; i++) { + sum += bs_hist[i]; + first_moment += bs_hist[i] * i; + second_moment += bs_hist[i] * i * i; + } + + // if variance is small enough, + // adjust the range around its mean size, which gives a tighter range + var_unnormalized = second_moment - first_moment * first_moment / sum; + if (var_unnormalized <= 4 * sum) { + int mean = first_moment / sum; + min_size = min_partition_size[mean]; + max_size = max_partition_size[mean]; + } else { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } } } - // Give a bit of leaway either side of the observed min and max - *min_block_size = min_partition_size[*min_block_size]; - *max_block_size = max_partition_size[*max_block_size]; + // Check border cases where max and min from neighbors may not be legal. + max_size = find_partition_size(max_size, + row8x8_remaining, col8x8_remaining, + &bh, &bw); + min_size = MIN(min_size, max_size); - // Check border cases where max and min from neighbours may not be legal. - *max_block_size = find_partition_size(*max_block_size, - row8x8_remaining, col8x8_remaining, - &bh, &bw); - *min_block_size = MIN(*min_block_size, *max_block_size); + // When use_square_partition_only is true, make sure at least one square + // partition is allowed by selecting the next smaller square size as + // *min_block_size. + if (cpi->sf.use_square_partition_only && + next_square_size[max_size] < min_size) { + min_size = next_square_size[max_size]; + } + + *min_block_size = min_size; + *max_block_size = max_size; } -static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) { +static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + MODE_INFO **mi_8x8 = xd->mi; + const int left_in_image = xd->left_available && mi_8x8[-1]; + const int above_in_image = xd->up_available && + mi_8x8[-xd->mi_stride]; + int row8x8_remaining = tile->mi_row_end - mi_row; + int col8x8_remaining = tile->mi_col_end - mi_col; + int bh, bw; + BLOCK_SIZE min_size = BLOCK_32X32; + BLOCK_SIZE max_size = BLOCK_8X8; + int bsl = mi_width_log2(BLOCK_64X64); + const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + // Trap case where we do not have a prediction. + if (search_range_ctrl && + (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { + int block; + MODE_INFO **mi; + BLOCK_SIZE sb_type; - // Only use 8x8 result for non HD videos. 
- // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0; - int use_8x8 = 1; - - if (cm->frame_type && !cpi->is_src_frame_alt_ref && - ((use_8x8 && bsize == BLOCK_16X16) || - bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { - int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; - PICK_MODE_CONTEXT *block_context = NULL; - - if (bsize == BLOCK_16X16) { - block_context = x->sb8x8_context[x->sb_index][x->mb_index]; - } else if (bsize == BLOCK_32X32) { - block_context = x->mb_context[x->sb_index]; - } else if (bsize == BLOCK_64X64) { - block_context = x->sb32_context; + // Find the min and max partition sizes used in the left SB64. + if (left_in_image) { + MODE_INFO *cur_mi; + mi = &mi_8x8[-1]; + for (block = 0; block < MI_BLOCK_SIZE; ++block) { + cur_mi = mi[block * xd->mi_stride]; + sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0; + min_size = MIN(min_size, sb_type); + max_size = MAX(max_size, sb_type); + } + } + // Find the min and max partition sizes used in the above SB64. + if (above_in_image) { + mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE]; + for (block = 0; block < MI_BLOCK_SIZE; ++block) { + sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0; + min_size = MIN(min_size, sb_type); + max_size = MAX(max_size, sb_type); + } } - if (block_context) { - ref0 = block_context[0].mic.mbmi.ref_frame[0]; - ref1 = block_context[1].mic.mbmi.ref_frame[0]; - ref2 = block_context[2].mic.mbmi.ref_frame[0]; - ref3 = block_context[3].mic.mbmi.ref_frame[0]; - } + min_size = min_partition_size[min_size]; + max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining, + &bh, &bw); + min_size = MIN(min_size, max_size); + min_size = MAX(min_size, BLOCK_8X8); + max_size = MIN(max_size, BLOCK_32X32); + } else { + min_size = BLOCK_8X8; + max_size = BLOCK_32X32; + } - // Currently, only consider 4 inter reference frames. - if (ref0 && ref1 && ref2 && ref3) { - int d01, d23, d02, d13; + *min_block_size = min_size; + *max_block_size = max_size; +} - // Motion vectors for the four subblocks. - int16_t mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; - int16_t mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; - int16_t mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; - int16_t mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; - int16_t mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; - int16_t mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; - int16_t mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; - int16_t mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) { + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + int idx, idy; - // Adjust sign if ref is alt_ref. - if (cm->ref_frame_sign_bias[ref0]) { - mvr0 *= -1; - mvc0 *= -1; - } + MODE_INFO *mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + BLOCK_SIZE bs, min_size, max_size; - if (cm->ref_frame_sign_bias[ref1]) { - mvr1 *= -1; - mvc1 *= -1; - } + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; - if (cm->ref_frame_sign_bias[ref2]) { - mvr2 *= -1; - mvc2 *= -1; - } - - if (cm->ref_frame_sign_bias[ref3]) { - mvr3 *= -1; - mvc3 *= -1; - } - - // Calculate mv distances. 
- d01 = MAX(abs(mvr0 - mvr1), abs(mvc0 - mvc1)); - d23 = MAX(abs(mvr2 - mvr3), abs(mvc2 - mvc3)); - d02 = MAX(abs(mvr0 - mvr2), abs(mvc0 - mvc2)); - d13 = MAX(abs(mvr1 - mvr3), abs(mvc1 - mvc3)); - - if (d01 < FAST_MOTION_MV_THRESH && d23 < FAST_MOTION_MV_THRESH && - d02 < FAST_MOTION_MV_THRESH && d13 < FAST_MOTION_MV_THRESH) { - // Set fast motion search level. - x->fast_ms = 1; - - if (ref0 == ref1 && ref1 == ref2 && ref2 == ref3 && - d01 < 2 && d23 < 2 && d02 < 2 && d13 < 2) { - // Set fast motion search level. - x->fast_ms = 2; - - if (!d01 && !d23 && !d02 && !d13) { - x->fast_ms = 3; - x->subblock_ref = ref0; - } - } + if (prev_mi) { + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); } } } + + if (xd->left_available) { + for (idy = 0; idy < mi_height; ++idy) { + mi = xd->mi[idy * cm->mi_stride - 1]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (xd->up_available) { + for (idx = 0; idx < mi_width; ++idx) { + mi = xd->mi[idx - cm->mi_stride]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = min_size; + *max_bs = max_size; } static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { @@ -1473,19 +1965,67 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } +#if CONFIG_FP_MB_STATS +const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4}; +const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = + {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120}; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = + {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120}; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6}; + +typedef enum { + MV_ZERO = 0, + MV_LEFT = 1, + MV_UP = 2, + MV_RIGHT = 3, + MV_DOWN = 4, + MV_INVALID +} MOTION_DIRECTION; + +static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { + if (fp_byte & FPMB_MOTION_ZERO_MASK) { + return MV_ZERO; + } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { + return MV_LEFT; + } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { + return MV_RIGHT; + } else if (fp_byte & FPMB_MOTION_UP_MASK) { + return MV_UP; + } else { + return MV_DOWN; + } +} + +static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, + MOTION_DIRECTION that_mv) { + if (this_mv == that_mv) { + return 0; + } else { + return abs(this_mv - that_mv) == 2 ? 2 : 1; + } +} +#endif + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. 
static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd) { + int64_t *dist, int64_t best_rd, + PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; - const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; + PICK_MODE_CONTEXT *ctx = &pc_tree->none; int i, pl; BLOCK_SIZE subsize; int this_rate, sum_rate = 0, best_rate = INT_MAX; @@ -1493,46 +2033,53 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, int64_t sum_rd = 0; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks - const int force_horz_split = (mi_row + ms >= cm->mi_rows); - const int force_vert_split = (mi_col + ms >= cm->mi_cols); + const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); + const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + BLOCK_SIZE min_size = cpi->sf.min_partition_size; + BLOCK_SIZE max_size = cpi->sf.max_partition_size; + +#if CONFIG_FP_MB_STATS + unsigned int src_diff_var = UINT_MAX; + int none_complexity = 0; +#endif int partition_none_allowed = !force_horz_split && !force_vert_split; - int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8; - int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8; - - int partition_split_done = 0; + int partition_horz_allowed = !force_vert_split && yss <= xss && + bsize >= BLOCK_8X8; + int partition_vert_allowed = !force_horz_split && xss <= yss && + bsize >= BLOCK_8X8; (void) *tp_orig; - if (bsize < BLOCK_8X8) { - // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 - // there is nothing to be done. - if (x->ab_index != 0) { - *rate = 0; - *dist = 0; - return; - } - } assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - if (bsize == BLOCK_16X16) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) x->mb_energy = vp9_block_energy(cpi, x, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { partition_horz_allowed &= force_horz_split; @@ -1541,48 +2088,147 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (cpi->sf.disable_split_var_thresh && partition_none_allowed) { - unsigned int source_variancey; - vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); - source_variancey = get_sby_perpixel_variance(cpi, x, bsize); - if (source_variancey < cpi->sf.disable_split_var_thresh) { - do_split = 0; - if (source_variancey < cpi->sf.disable_split_var_thresh / 2) - do_rect = 0; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, bsize); + } +#endif + +#if CONFIG_FP_MB_STATS + // Decide whether we shall split directly and skip searching NONE by using + // the first pass block statistics + if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && + partition_none_allowed && src_diff_var > 4 && + cm->base_qindex < qindex_split_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + // compute a complexity measure, basically measure inconsistency of motion + // vectors obtained from the first pass in the current block + for (r = mb_row; r < mb_row_end ; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + + MOTION_DIRECTION this_mv; + MOTION_DIRECTION right_mv; + MOTION_DIRECTION bottom_mv; + + this_mv = + get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); + + // to its right + if (c != mb_col_end - 1) { + right_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + 1]); + none_complexity += get_motion_inconsistency(this_mv, right_mv); + } + + // to its bottom + if (r != mb_row_end - 1) { + bottom_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); + none_complexity += get_motion_inconsistency(this_mv, bottom_mv); + } + + // do not count its left and top neighbors to avoid double counting + } + } + + if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { + partition_none_allowed = 0; } } +#endif // PARTITION_NONE if (partition_none_allowed) { - pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, - get_block_context(x, bsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, + ctx, best_rd, 0); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { - pl = partition_plane_context(cpi->above_seg_context, - cpi->left_seg_context, - 
mi_row, mi_col, bsize); - this_rate += x->partition_cost[pl][PARTITION_NONE]; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + this_rate += cpi->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { - int64_t stop_thresh = 2048; + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_dist_thr; + int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr; best_rate = this_rate; best_dist = this_dist; best_rd = sum_rd; if (bsize >= BLOCK_8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + pc_tree->partitioning = PARTITION_NONE; - // Adjust threshold according to partition size. - stop_thresh >>= 8 - (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); + // Adjust dist breakout threshold according to the partition size. + dist_breakout_thr >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); - // If obtained distortion is very small, choose current partition - // and stop splitting. - if (this_dist < stop_thresh) { + // If all y, u, v transform blocks in this partition are skippable, and + // the dist & rate are within the thresholds, the partition search is + // terminated for the current branch of the partition search tree. + // The dist & rate thresholds are set to 0 at speed 0 to disable the + // early termination at that speed. + if (!x->e_mbd.lossless && + (ctx->skippable && best_dist < dist_breakout_thr && + best_rate < rate_breakout_thr)) { do_split = 0; do_rect = 0; } + +#if CONFIG_FP_MB_STATS + // Check if every 16x16 first pass block's statistics show zero + // motion and the corresponding first pass residue is small enough. + // If that is the case, check the difference variance between the + // current frame and the last frame. If the variance is small enough, + // stop further splitting in RD optimization. + if (cpi->use_fp_mb_stats && do_split != 0 && + cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + int skip = 1; + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + if (!(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_MOTION_ZERO_MASK) || + !(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_ERROR_SMALL_MASK)) { + skip = 0; + break; + } + } + if (skip == 0) { + break; + } + } + if (skip) { + if (src_diff_var == UINT_MAX) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance( + cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize); + } + if (src_diff_var < 8) { + do_split = 0; + do_rect = 0; + } + } + } +#endif } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -1590,7 +2236,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // store estimated motion vector if (cpi->sf.adaptive_motion_search) - store_pred_mv(x, get_block_context(x, bsize)); + store_pred_mv(x, ctx); // PARTITION_SPLIT sum_rd = 0; @@ -1598,38 +2244,53 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // the starting point of motion search in the following partition type check.
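The shift applied to dist_breakout_thr above scales the breakout threshold with the pixel count of the block. Assuming b_width_log2()/b_height_log2() follow this codebase's convention of returning the log2 of the block dimension in 4-pel units (1 for an 8-pel side, 4 for a 64-pel side), an 8x8 block uses the threshold divided by 64 while a 64x64 block uses it unshifted. A small sketch under that assumption:

#include <stdio.h>

/* bwl/bhl assumed to follow the b_width_log2()/b_height_log2()
 * convention: log2 of the block dimension in 4-pel units. */
static long long scale_breakout_thr(long long thr, int bwl, int bhl) {
  return thr >> (8 - (bwl + bhl));  /* shift is 0 only for 64x64 (4 + 4) */
}

int main(void) {
  const long long thr = 4096;  /* hypothetical breakout_dist_thr value */
  printf("8x8:   %lld\n", scale_breakout_thr(thr, 1, 1));  /* 4096 >> 6 = 64 */
  printf("16x16: %lld\n", scale_breakout_thr(thr, 2, 2));  /* 4096 >> 4 = 256 */
  printf("32x32: %lld\n", scale_breakout_thr(thr, 3, 3));  /* 4096 >> 2 = 1024 */
  printf("64x64: %lld\n", scale_breakout_thr(thr, 4, 4));  /* 4096 >> 0 = 4096 */
  return 0;
}

Larger blocks cover more pixels, so they are allowed a proportionally larger total distortion before the early termination fires.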
if (do_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); - for (i = 0; i < 4 && sum_rd < best_rd; ++i) { - const int x_idx = (i & 1) * ms; - const int y_idx = (i >> 1) * ms; - - if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) - continue; - - *get_sb_index(x, subsize) = i; - if (cpi->sf.adaptive_motion_search) - load_pred_mv(x, get_block_context(x, bsize)); - rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &this_rate, &this_dist, i != 3, best_rd - sum_rd); - - if (this_rate == INT_MAX) { + if (bsize == BLOCK_8X8) { + i = 4; + if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) + pc_tree->leaf_split[0]->pred_interp_filter = + ctx->mic.mbmi.interp_filter; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + pc_tree->leaf_split[0], best_rd, 0); + if (sum_rate == INT_MAX) sum_rd = INT64_MAX; - } else { - sum_rate += this_rate; - sum_dist += this_dist; + else sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } else { + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + const int x_idx = (i & 1) * mi_step; + const int y_idx = (i >> 1) * mi_step; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; + + if (cpi->sf.adaptive_motion_search) + load_pred_mv(x, ctx); + + pc_tree->split[i]->index = i; + rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, + subsize, &this_rate, &this_dist, + best_rd - sum_rd, pc_tree->split[i]); + + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } } } + if (sum_rd < best_rd && i == 4) { - pl = partition_plane_context(cpi->above_seg_context, - cpi->left_seg_context, - mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_SPLIT]; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { best_rate = sum_rate; best_dist = sum_dist; best_rd = sum_rd; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_SPLIT; } } else { // skip rectangular partition test when larger block size @@ -1637,38 +2298,36 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.less_rectangular_check) do_rect &= !partition_none_allowed; } - partition_split_done = 1; restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - x->fast_ms = 0; - x->subblock_ref = 0; - - if (partition_split_done && - cpi->sf.using_small_partition_info) { - compute_fast_motion_search_level(cpi, bsize); - } - // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) - load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + load_pred_mv(x, ctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[0].pred_interp_filter = + ctx->mic.mbmi.interp_filter; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + &pc_tree->horizontal[0], best_rd, 0); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); - if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), subsize, 0); - 
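The split loop above accumulates rate and distortion across the four child blocks and abandons the split as soon as the running RD cost crosses best_rd. A sketch of that accumulation, with RDCOST written out under its usual definition in this codebase (rate weighted by rdmult with >>8 rounding, distortion scaled up by the rddiv shift); the authoritative macro lives in the RD headers and may differ slightly:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

/* RDCOST under its assumed definition here. */
static int64_t rdcost(int rdmult, int rddiv, int rate, int64_t dist) {
  return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
}

/* Accumulate the four child results as the loop above does: bail out once
 * the running cost meets best_rd, and propagate INT64_MAX when any child
 * failed (rate == INT_MAX). */
static int64_t sum_split_rd(const int rate[4], const int64_t dist[4],
                            int rdmult, int rddiv, int64_t best_rd) {
  int i, sum_rate = 0;
  int64_t sum_dist = 0, sum_rd = 0;
  for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
    if (rate[i] == INT_MAX) return INT64_MAX;
    sum_rate += rate[i];
    sum_dist += dist[i];
    sum_rd = rdcost(rdmult, rddiv, sum_rate, sum_dist);
  }
  return sum_rd;
}

int main(void) {
  const int r[4] = { 10, 12, 9, 11 };       /* hypothetical child rates */
  const int64_t d[4] = { 100, 90, 120, 80 };  /* hypothetical child dists */
  printf("split rd: %lld\n", (long long)sum_split_rd(r, d, 64, 2, INT64_MAX));
  return 0;
}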
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + if (sum_rd < best_rd && mi_row + mi_step < cm->mi_rows) { + PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + update_state(cpi, ctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx); - *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) - load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + load_pred_mv(x, ctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->horizontal[1].pred_interp_filter = + ctx->mic.mbmi.interp_filter; + rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rate, + &this_dist, subsize, &pc_tree->horizontal[1], + best_rd - sum_rd, 1); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -1678,41 +2337,46 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } if (sum_rd < best_rd) { - pl = partition_plane_context(cpi->above_seg_context, - cpi->left_seg_context, - mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_HORZ]; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rate += cpi->partition_cost[pl][PARTITION_HORZ]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { best_rd = sum_rd; best_rate = sum_rate; best_dist = sum_dist; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_HORZ; } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - // PARTITION_VERT if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - *get_sb_index(x, subsize) = 0; if (cpi->sf.adaptive_motion_search) - load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + load_pred_mv(x, ctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[0].pred_interp_filter = + ctx->mic.mbmi.interp_filter; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + &pc_tree->vertical[0], best_rd, 0); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); - if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + if (sum_rd < best_rd && mi_col + mi_step < cm->mi_cols) { + update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, + &pc_tree->vertical[0]); - *get_sb_index(x, subsize) = 1; if (cpi->sf.adaptive_motion_search) - load_pred_mv(x, get_block_context(x, bsize)); - pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + load_pred_mv(x, ctx); + if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && + partition_none_allowed) + pc_tree->vertical[1].pred_interp_filter = + ctx->mic.mbmi.interp_filter; + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rate, + &this_dist, subsize, + &pc_tree->vertical[1], best_rd - sum_rd, + 1); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -1722,79 +2386,62 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } if (sum_rd < best_rd) { - pl = partition_plane_context(cpi->above_seg_context, - 
cpi->left_seg_context, - mi_row, mi_col, bsize); - sum_rate += x->partition_cost[pl][PARTITION_VERT]; + pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rate += cpi->partition_cost[pl][PARTITION_VERT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd) { best_rate = sum_rate; best_dist = sum_dist; best_rd = sum_rd; - *(get_sb_partitioning(x, bsize)) = subsize; + pc_tree->partitioning = PARTITION_VERT; } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - + // TODO(jbb): This code was added so that we avoid static analysis + // warnings related to the fact that best_rd isn't used after this + // point. This code should be refactored so that the duplicate + // checks occur in some sub function and thus are used. + (void) best_rd; *rate = best_rate; *dist = best_dist; - if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) - encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize); + if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) { + int output_enabled = (bsize == BLOCK_64X64); + + // Check the projected output rate for this SB against its target + // and if necessary apply a Q delta using segmentation to get + // closer to the target. + if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) + vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, + best_rate); + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, + best_rate, best_dist); + + encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); + } + if (bsize == BLOCK_64X64) { assert(tp_orig < *tp); assert(best_rate < INT_MAX); - assert(best_dist < INT_MAX); + assert(best_dist < INT64_MAX); } else { assert(tp_orig == *tp); } } -// Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl; - int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; - int pl; - int r; - int64_t d; - - save_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); - - // Default is non mask (all reference frames allowed. - cpi->ref_frame_mask = 0; - - // Do RD search for 64x64.
- if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - cpi->set_ref_frame_mask = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, - get_block_context(x, BLOCK_64X64), INT64_MAX); - pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, - mi_row, mi_col, BLOCK_64X64); - r += x->partition_cost[pl][PARTITION_NONE]; - - *(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64; - cpi->set_ref_frame_mask = 0; - } - - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64); -} - -static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, TOKENEXTRA **tp) { - VP9_COMMON * const cm = &cpi->common; +static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + SPEED_FEATURES *const sf = &cpi->sf; int mi_col; // Initialize the left context for the new SB row - vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context)); - vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context)); + vpx_memset(&xd->left_context, 0, sizeof(xd->left_context)); + vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; @@ -1802,58 +2449,98 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int dummy_rate; int64_t dummy_dist; + int i; + + if (sf->adaptive_pred_interp_filter) { + for (i = 0; i < 64; ++i) + cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE; + + for (i = 0; i < 64; ++i) { + cpi->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE; + cpi->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE; + cpi->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE; + cpi->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE; + } + } + vp9_zero(cpi->mb.pred_mv); + cpi->pc_root->index = 0; - if (cpi->sf.reference_masking) - rd_pick_reference_frame(cpi, tile, mi_row, mi_col); - - if (cpi->sf.use_lastframe_partitioning || - cpi->sf.use_one_partition_size_always ) { - const int idx_str = cm->mode_info_stride * mi_row + mi_col; - MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str; - MODE_INFO **prev_mi_8x8 = cm->prev_mi_grid_visible + idx_str; - + if ((sf->partition_search_type == SEARCH_PARTITION && + sf->use_lastframe_partitioning) || + sf->partition_search_type == FIXED_PARTITION || + sf->partition_search_type == VAR_BASED_PARTITION || + sf->partition_search_type == VAR_BASED_FIXED_PARTITION) { + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str; cpi->mb.source_variance = UINT_MAX; - if (cpi->sf.use_one_partition_size_always) { + if (sf->partition_search_type == FIXED_PARTITION) { set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); - set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col); - rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, + sf->always_this_block_size); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); + } else if (cpi->skippable_frame || + sf->partition_search_type == VAR_BASED_FIXED_PARTITION) { + BLOCK_SIZE bsize; + set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); + bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col); + set_fixed_partitioning(cpi, tile, mi, 
mi_row, mi_col, bsize); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); + } else if (sf->partition_search_type == VAR_BASED_PARTITION) { + choose_partitioning(cpi, tile, mi_row, mi_col); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); } else { - if ((cpi->common.current_video_frame - % cpi->sf.last_partitioning_redo_frequency) == 0 + GF_GROUP * gf_grp = &cpi->twopass.gf_group; + int last_was_mid_sequence_overlay = 0; + if ((cpi->oxcf.pass == 2) && (gf_grp->index)) { + if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE) + last_was_mid_sequence_overlay = 1; + } + if ((cpi->rc.frames_since_key + % sf->last_partitioning_redo_frequency) == 0 + || last_was_mid_sequence_overlay || cm->prev_mi == 0 - || cpi->common.show_frame == 0 - || cpi->common.frame_type == KEY_FRAME - || cpi->is_src_frame_alt_ref - || ((cpi->sf.use_lastframe_partitioning == + || cm->show_frame == 0 + || cm->frame_type == KEY_FRAME + || cpi->rc.is_src_frame_alt_ref + || ((sf->use_lastframe_partitioning == LAST_FRAME_PARTITION_LOW_MOTION) && - sb_has_motion(cpi, prev_mi_8x8))) { + sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) { // If required set upper and lower partition size limits - if (cpi->sf.auto_min_max_partition_size) { + if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); rd_auto_partition_range(cpi, tile, mi_row, mi_col, - &cpi->sf.min_partition_size, - &cpi->sf.max_partition_size); + &sf->min_partition_size, + &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX); + &dummy_rate, &dummy_dist, INT64_MAX, + cpi->pc_root); } else { - copy_partitioning(cpi, mi_8x8, prev_mi_8x8); - rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1); + if (sf->constrain_copy_partition && + sb_has_motion(cm, prev_mi, sf->lf_motion_threshold)) + constrain_copy_partitioning(cpi, tile, mi, prev_mi, + mi_row, mi_col, BLOCK_16X16); + else + copy_partitioning(cm, mi, prev_mi); + rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, cpi->pc_root); } } } else { // If required set upper and lower partition size limits - if (cpi->sf.auto_min_max_partition_size) { + if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64); rd_auto_partition_range(cpi, tile, mi_row, mi_col, - &cpi->sf.min_partition_size, - &cpi->sf.max_partition_size); + &sf->min_partition_size, + &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX); + &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root); } } } @@ -1864,181 +2551,18 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - x->act_zbin_adj = 0; - cpi->seg0_idx = 0; - - xd->mode_info_stride = cm->mode_info_stride; - - // reset intra mode contexts - if (frame_is_intra_only(cm)) - vp9_init_mbmode_probs(cm); - // Copy data over into macro block data structures. vp9_setup_src_planes(x, cpi->Source, 0, 0); - // TODO(jkoleszar): are these initializations required? 
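In the SEARCH_PARTITION path above, a fresh rd_pick_partition() is run only when reusing the previous frame's partitioning would be unsafe; otherwise the partitioning is copied (optionally constrained to 16x16 where motion is detected). The decision predicate, pulled out as a standalone helper whose inputs mirror the fields tested in the diff:

/* The redo decision from encode_rd_sb_row above, isolated for clarity;
 * every input mirrors a field or test used in the diff. */
static int should_redo_partition(int frames_since_key, int redo_frequency,
                                 int last_was_overlay, int have_prev_mi,
                                 int show_frame, int is_key_frame,
                                 int is_src_frame_alt_ref,
                                 int low_motion_mode, int sb_has_motion) {
  return (frames_since_key % redo_frequency) == 0 ||
         last_was_overlay ||      /* partitioning predates a GF overlay */
         !have_prev_mi ||         /* nothing to copy from */
         !show_frame ||           /* hidden (e.g. ARF) frame */
         is_key_frame ||
         is_src_frame_alt_ref ||  /* ARF overlay frame */
         (low_motion_mode && sb_has_motion);  /* LOW_MOTION mode + motion */
}

The periodic redo plus the mid-sequence-overlay check keeps a stale partitioning from persisting across a golden-frame group boundary.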
- setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], - 0, 0, NULL); - setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0); - - setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - - xd->mi_8x8[0]->mbmi.mode = DC_PRED; - xd->mi_8x8[0]->mbmi.uv_mode = DC_PRED; - - vp9_zero(cpi->y_mode_count); - vp9_zero(cpi->y_uv_mode_count); - vp9_zero(cm->counts.inter_mode); - vp9_zero(cpi->partition_count); - vp9_zero(cpi->intra_inter_count); - vp9_zero(cpi->comp_inter_count); - vp9_zero(cpi->single_ref_count); - vp9_zero(cpi->comp_ref_count); - vp9_zero(cm->counts.tx); - vp9_zero(cm->counts.mbskip); + vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. - vpx_memset(cpi->above_context[0], 0, - sizeof(*cpi->above_context[0]) * + vpx_memset(xd->above_context[0], 0, + sizeof(*xd->above_context[0]) * 2 * aligned_mi_cols * MAX_MB_PLANE); - vpx_memset(cpi->above_seg_context, 0, - sizeof(*cpi->above_seg_context) * aligned_mi_cols); -} - -static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { - if (lossless) { - // printf("Switching to lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; - cpi->mb.optimize = 0; - cpi->common.lf.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; - cpi->common.tx_mode = ONLY_4X4; - } else { - // printf("Not lossless\n"); - cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; - } -} - -static void switch_tx_mode(VP9_COMP *cpi) { - if (cpi->sf.tx_size_search_method == USE_LARGESTALL && - cpi->common.tx_mode >= ALLOW_32X32) - cpi->common.tx_mode = ALLOW_32X32; -} - -static void encode_frame_internal(VP9_COMP *cpi) { - int mi_row; - MACROBLOCK * const x = &cpi->mb; - VP9_COMMON * const cm = &cpi->common; - MACROBLOCKD * const xd = &x->e_mbd; - -// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", -// cpi->common.current_video_frame, cpi->common.show_frame, -// cm->frame_type); - -// debug output -#if DBG_PRNT_SEGMAP - { - FILE *statsfile; - statsfile = fopen("segmap2.stt", "a"); - fprintf(statsfile, "\n"); - fclose(statsfile); - } -#endif - - vp9_zero(cm->counts.switchable_interp); - vp9_zero(cpi->tx_stepdown_count); - - xd->mi_8x8 = cm->mi_grid_visible; - // required for vp9_frame_init_quantizer - xd->mi_8x8[0] = cm->mi; - - xd->last_mi = cm->prev_mi; - - vp9_zero(cpi->NMVcount); - vp9_zero(cpi->coef_counts); - vp9_zero(cm->counts.eob_branch); - - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 - && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; - switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); - - vp9_frame_init_quantizer(cpi); - - vp9_initialize_rd_consts(cpi); - vp9_initialize_me_consts(cpi, cm->base_qindex); - switch_tx_mode(cpi); - - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - // Initialize encode frame context. - init_encode_frame_mb_context(cpi); - - // Build a frame level activity map - build_activity_map(cpi); - } - - // Re-initialize encode frame context. 
- init_encode_frame_mb_context(cpi); - - vp9_zero(cpi->rd_comp_pred_diff); - vp9_zero(cpi->rd_filter_diff); - vp9_zero(cpi->rd_tx_select_diff); - vp9_zero(cpi->rd_tx_select_threshes); - - set_prev_mi(cm); - - { - struct vpx_usec_timer emr_timer; - vpx_usec_timer_start(&emr_timer); - - { - // Take tiles into account and give start/end MB - int tile_col, tile_row; - TOKENEXTRA *tp = cpi->tok; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - - for (tile_row = 0; tile_row < tile_rows; tile_row++) { - for (tile_col = 0; tile_col < tile_cols; tile_col++) { - TileInfo tile; - TOKENEXTRA *tp_old = tp; - - // For each row of SBs in the frame - vp9_tile_init(&tile, cm, tile_row, tile_col); - for (mi_row = tile.mi_row_start; - mi_row < tile.mi_row_end; mi_row += 8) - encode_sb_row(cpi, &tile, mi_row, &tp); - - cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); - assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); - } - } - } - - vpx_usec_timer_mark(&emr_timer); - cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); - } - - if (cpi->sf.skip_encode_sb) { - int j; - unsigned int intra_count = 0, inter_count = 0; - for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { - intra_count += cpi->intra_inter_count[j][0]; - inter_count += cpi->intra_inter_count[j][1]; - } - cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count); - cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME); - cpi->sf.skip_encode_frame &= cm->show_frame; - } else { - cpi->sf.skip_encode_frame = 0; - } - -#if 0 - // Keep record of the total distortion this time around for future use - cpi->last_frame_distortion = cpi->frame_distortion; -#endif + vpx_memset(xd->above_seg_context, 0, + sizeof(*xd->above_seg_context) * aligned_mi_cols); } static int check_dual_ref_flags(VP9_COMP *cpi) { @@ -2052,152 +2576,817 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { } } -static int get_skip_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs) { - int x, y; +static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) { + int mi_row, mi_col; + const int mis = cm->mi_stride; + MODE_INFO **mi_ptr = cm->mi_grid_visible; - for (y = 0; y < ymbs; y++) { - for (x = 0; x < xmbs; x++) { - if (!mi_8x8[y * mis + x]->mbmi.skip_coeff) - return 0; - } - } - - return 1; -} - -static void set_txfm_flag(MODE_INFO **mi_8x8, int mis, int ymbs, int xmbs, - TX_SIZE tx_size) { - int x, y; - - for (y = 0; y < ymbs; y++) { - for (x = 0; x < xmbs; x++) - mi_8x8[y * mis + x]->mbmi.tx_size = tx_size; - } -} - -static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, - int mis, TX_SIZE max_tx_size, int bw, int bh, - int mi_row, int mi_col, BLOCK_SIZE bsize) { - VP9_COMMON * const cm = &cpi->common; - - if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) { - return; - } else { - MB_MODE_INFO * const mbmi = &mi_8x8[0]->mbmi; - if (mbmi->tx_size > max_tx_size) { - const int ymbs = MIN(bh, cm->mi_rows - mi_row); - const int xmbs = MIN(bw, cm->mi_cols - mi_col); - - assert(vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi_8x8, mis, ymbs, xmbs)); - set_txfm_flag(mi_8x8, mis, ymbs, xmbs, max_tx_size); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size) + mi_ptr[mi_col]->mbmi.tx_size = max_tx_size; } } } -static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, - TX_SIZE max_tx_size, int mi_row, int 
mi_col, - BLOCK_SIZE bsize) { - VP9_COMMON * const cm = &cpi->common; - const int mis = cm->mode_info_stride; - int bw, bh; - const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; +static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { + if (frame_is_intra_only(&cpi->common)) + return INTRA_FRAME; + else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) + return ALTREF_FRAME; + else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) + return GOLDEN_FRAME; + else + return LAST_FRAME; +} + +static TX_MODE select_tx_mode(const VP9_COMP *cpi) { + if (cpi->mb.e_mbd.lossless) + return ONLY_4X4; + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) + return ALLOW_32X32; + else if (cpi->sf.tx_size_search_method == USE_FULL_RD || + cpi->sf.tx_size_search_method == USE_TX_8X8) + return TX_MODE_SELECT; + else + return cpi->common.tx_mode; +} + +static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, + int *rate, int64_t *dist, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + set_offsets(cpi, tile, mi_row, mi_col, bsize); + mbmi = &xd->mi[0]->mbmi; + mbmi->sb_type = bsize; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) + if (mbmi->segment_id && x->in_static_area) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + + if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize); + else + vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx); + + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); +} + +static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, + int mi_row, int mi_col, + BLOCK_SIZE bsize, BLOCK_SIZE subsize, + PC_TREE *pc_tree) { + MACROBLOCKD *xd = &x->e_mbd; + int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; + PARTITION_TYPE partition = pc_tree->partitioning; + + assert(bsize >= BLOCK_8X8); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; - bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; + switch (partition) { + case PARTITION_NONE: + set_modeinfo_offsets(cm, xd, mi_row, mi_col); + *(xd->mi[0]) = pc_tree->none.mic; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + break; + case PARTITION_VERT: + set_modeinfo_offsets(cm, xd, mi_row, mi_col); + *(xd->mi[0]) = pc_tree->vertical[0].mic; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); - if (bw == bs && bh == bs) { - reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, bs, mi_row, - mi_col, bsize); - } else if (bw == bs && bh < bs) { - reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, bs, hbs, mi_row, - mi_col, bsize); - reset_skip_txfm_size_b(cpi, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs, - mi_row + hbs, mi_col, bsize); - } else if (bw < bs && bh == bs) { - reset_skip_txfm_size_b(cpi, mi_8x8, mis, max_tx_size, hbs, bs, mi_row, - mi_col, bsize); - reset_skip_txfm_size_b(cpi, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row, - mi_col + hbs, bsize); - - } else { - const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize]; - int n; - - assert(bw < bs && bh < bs); - - for (n = 0; n < 4; n++) { - const int mi_dc = hbs * (n & 1); - const int mi_dr = hbs * (n >> 1); - - reset_skip_txfm_size_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc], max_tx_size, - mi_row + mi_dr, mi_col + mi_dc,
subsize); + if (mi_col + hbs < cm->mi_cols) { + set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs); + *(xd->mi[0]) = pc_tree->vertical[1].mic; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize); + } + break; + case PARTITION_HORZ: + set_modeinfo_offsets(cm, xd, mi_row, mi_col); + *(xd->mi[0]) = pc_tree->horizontal[0].mic; + duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); + if (mi_row + hbs < cm->mi_rows) { + set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col); + *(xd->mi[0]) = pc_tree->horizontal[1].mic; + duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize); + } + break; + case PARTITION_SPLIT: { + BLOCK_SIZE subsubsize = get_subsize(subsize, PARTITION_SPLIT); + fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, + subsubsize, pc_tree->split[0]); + fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize, + subsubsize, pc_tree->split[1]); + fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize, + subsubsize, pc_tree->split[2]); + fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize, + subsubsize, pc_tree->split[3]); + break; } + default: + break; } } -static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { - VP9_COMMON * const cm = &cpi->common; - int mi_row, mi_col; - const int mis = cm->mode_info_stride; -// MODE_INFO *mi, *mi_ptr = cm->mi; - MODE_INFO **mi_8x8, **mi_ptr = cm->mi_grid_visible; +static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, + TOKENEXTRA **tp, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate, + int64_t *dist, int do_recon, int64_t best_rd, + PC_TREE *pc_tree) { + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int ms = num_8x8_blocks_wide_lookup[bsize] / 2; + TOKENEXTRA *tp_orig = *tp; + PICK_MODE_CONTEXT *ctx = &pc_tree->none; + int i; + BLOCK_SIZE subsize = bsize; + int this_rate, sum_rate = 0, best_rate = INT_MAX; + int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX; + int64_t sum_rd = 0; + int do_split = bsize >= BLOCK_8X8; + int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + ms >= cm->mi_rows); + const int force_vert_split = (mi_col + ms >= cm->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { - mi_8x8 = mi_ptr; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi_8x8 += 8) { - reset_skip_txfm_size_sb(cpi, mi_8x8, txfm_max, mi_row, mi_col, - BLOCK_64X64); + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = !force_vert_split && yss <= xss && + bsize >= BLOCK_8X8; + int partition_vert_allowed = !force_horz_split && xss <= yss && + bsize >= BLOCK_8X8; + (void) *tp_orig; + + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); + + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. 
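Besides the frame-edge forcing, both partition pickers gate rectangular partitions on the chroma subsampling of plane 1: horizontal partitions require yss <= xss and vertical partitions require xss <= yss, so a rectangle whose chroma block would degenerate along its short axis is never searched. A truth-table sketch (edge forcing and the BLOCK_8X8 floor omitted):

#include <stdio.h>

int main(void) {
  /* xss/yss mirror x->e_mbd.plane[1].subsampling_x/_y */
  static const struct { const char *name; int xss, yss; } fmt[] = {
    { "4:4:4", 0, 0 },
    { "4:2:2", 1, 0 },
    { "4:2:0", 1, 1 },
  };
  int i;
  for (i = 0; i < 3; ++i)
    printf("%s -> horz allowed: %d, vert allowed: %d\n", fmt[i].name,
           fmt[i].yss <= fmt[i].xss,   /* term in partition_horz_allowed */
           fmt[i].xss <= fmt[i].yss);  /* term in partition_vert_allowed */
  return 0;
}

For 4:2:0 both rectangle types survive the gate; for 4:2:2 only horizontal partitions do, since a vertical half-width luma block would leave a chroma block subsampled twice along its narrow dimension.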
+ if (sf->auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= sf->max_partition_size && + bsize >= sf->min_partition_size); + partition_horz_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || + force_horz_split); + partition_vert_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || + force_vert_split); + do_split &= bsize > sf->min_partition_size; + } + if (sf->use_square_partition_only) { + partition_horz_allowed &= force_horz_split; + partition_vert_allowed &= force_vert_split; + } + + // PARTITION_NONE + if (partition_none_allowed) { + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, + &this_rate, &this_dist, bsize, ctx); + ctx->mic.mbmi = xd->mi[0]->mbmi; + ctx->skip_txfm[0] = x->skip_txfm[0]; + ctx->skip = x->skip; + + if (this_rate != INT_MAX) { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + this_rate += cpi->partition_cost[pl][PARTITION_NONE]; + sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { + int64_t stop_thresh = 4096; + int64_t stop_thresh_rd; + + best_rate = this_rate; + best_dist = this_dist; + best_rd = sum_rd; + if (bsize >= BLOCK_8X8) + pc_tree->partitioning = PARTITION_NONE; + + // Adjust threshold according to partition size. + stop_thresh >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); + + stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh); + // If obtained distortion is very small, choose current partition + // and stop splitting. + if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) { + do_split = 0; + do_rect = 0; + } + } } } -} -static int get_frame_type(VP9_COMP *cpi) { - int frame_type; - if (frame_is_intra_only(&cpi->common)) - frame_type = 0; - else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) - frame_type = 3; - else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - frame_type = 1; - else - frame_type = 2; - return frame_type; -} + // store estimated motion vector + store_pred_mv(x, ctx); -static void select_tx_mode(VP9_COMP *cpi) { - if (cpi->oxcf.lossless) { - cpi->common.tx_mode = ONLY_4X4; - } else if (cpi->common.current_video_frame == 0) { - cpi->common.tx_mode = TX_MODE_SELECT; - } else { - if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { - cpi->common.tx_mode = ALLOW_32X32; - } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) { - int frame_type = get_frame_type(cpi); - cpi->common.tx_mode = - cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] - > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? 
- ALLOW_32X32 : TX_MODE_SELECT; + // PARTITION_SPLIT + sum_rd = 0; + if (do_split) { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; + subsize = get_subsize(bsize, PARTITION_SPLIT); + for (i = 0; i < 4 && sum_rd < best_rd; ++i) { + const int x_idx = (i & 1) * ms; + const int y_idx = (i >> 1) * ms; + + if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + continue; + load_pred_mv(x, ctx); + nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, + subsize, &this_rate, &this_dist, 0, + best_rd - sum_rd, pc_tree->split[i]); + + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } + } + + if (sum_rd < best_rd) { + best_rate = sum_rate; + best_dist = sum_dist; + best_rd = sum_rd; + pc_tree->partitioning = PARTITION_SPLIT; } else { - unsigned int total = 0; - int i; - for (i = 0; i < TX_SIZES; ++i) - total += cpi->tx_stepdown_count[i]; - if (total) { - double fraction = (double)cpi->tx_stepdown_count[0] / total; - cpi->common.tx_mode = fraction > 0.90 ? ALLOW_32X32 : TX_MODE_SELECT; - // printf("fraction = %f\n", fraction); - } // else keep unchanged + // skip rectangular partition test when larger block size + // gives better rd cost + if (sf->less_rectangular_check) + do_rect &= !partition_none_allowed; } } + + // PARTITION_HORZ + if (partition_horz_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_HORZ); + if (sf->adaptive_motion_search) + load_pred_mv(x, ctx); + + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, + &this_rate, &this_dist, subsize, + &pc_tree->horizontal[0]); + + pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; + + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + + if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { + load_pred_mv(x, ctx); + nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, + &this_rate, &this_dist, subsize, + &pc_tree->horizontal[1]); + + pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; + + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + this_rate += cpi->partition_cost[pl][PARTITION_HORZ]; + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } + } + if (sum_rd < best_rd) { + best_rd = sum_rd; + best_rate = sum_rate; + best_dist = sum_dist; + pc_tree->partitioning = PARTITION_HORZ; + } + } + + // PARTITION_VERT + if (partition_vert_allowed && do_rect) { + subsize = get_subsize(bsize, PARTITION_VERT); + + if (sf->adaptive_motion_search) + load_pred_mv(x, ctx); + + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, + &this_rate, &this_dist, subsize, + &pc_tree->vertical[0]); + pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { + load_pred_mv(x, ctx); + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, + &this_rate, &this_dist, subsize, + &pc_tree->vertical[1]); + pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + 
pc_tree->vertical[1].skip = x->skip; + if (this_rate == INT_MAX) { + sum_rd = INT64_MAX; + } else { + int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + this_rate += cpi->partition_cost[pl][PARTITION_VERT]; + sum_rate += this_rate; + sum_dist += this_dist; + sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + } + } + if (sum_rd < best_rd) { + best_rate = sum_rate; + best_dist = sum_dist; + best_rd = sum_rd; + pc_tree->partitioning = PARTITION_VERT; + } + } + // TODO(JBB): The following line is here just to avoid a static warning + // that occurs because at this point we never again reuse best_rd + // despite setting it here. The code should be refactored to avoid this. + (void) best_rd; + + *rate = best_rate; + *dist = best_dist; + + if (best_rate == INT_MAX) + return; + + // update mode info array + subsize = get_subsize(bsize, pc_tree->partitioning); + fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize, + pc_tree); + + if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { + int output_enabled = (bsize == BLOCK_64X64); + + // Check the projected output rate for this SB against its target + // and if necessary apply a Q delta using segmentation to get + // closer to the target. + if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, + best_rate); + } + + if (oxcf->aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, + best_rate, best_dist); + + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); + } + + if (bsize == BLOCK_64X64) { + assert(tp_orig < *tp); + assert(best_rate < INT_MAX); + assert(best_dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } +} + +static void nonrd_use_partition(VP9_COMP *cpi, + const TileInfo *const tile, + MODE_INFO **mi, + TOKENEXTRA **tp, + int mi_row, int mi_col, + BLOCK_SIZE bsize, int output_enabled, + int *totrate, int64_t *totdist, + PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int bsl = b_width_log2(bsize), hbs = (1 << bsl) / 4; + const int mis = cm->mi_stride; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + int rate = INT_MAX; + int64_t dist = INT64_MAX; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) + return; + + subsize = (bsize >= BLOCK_8X8) ?
mi[0]->mbmi.sb_type : BLOCK_4X4; + partition = partition_lookup[bsl][subsize]; + + switch (partition) { + case PARTITION_NONE: + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + subsize, &pc_tree->none); + pc_tree->none.mic.mbmi = xd->mi[0]->mbmi; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; + pc_tree->none.skip = x->skip; + break; + case PARTITION_VERT: + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + subsize, &pc_tree->vertical[0]); + pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[0].skip = x->skip; + if (mi_col + hbs < cm->mi_cols) { + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs, + &rate, &dist, subsize, &pc_tree->vertical[1]); + pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->vertical[1].skip = x->skip; + if (rate != INT_MAX && dist != INT64_MAX && + *totrate != INT_MAX && *totdist != INT64_MAX) { + *totrate += rate; + *totdist += dist; + } + } + break; + case PARTITION_HORZ: + nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, + subsize, &pc_tree->horizontal[0]); + pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[0].skip = x->skip; + if (mi_row + hbs < cm->mi_rows) { + nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, + &rate, &dist, subsize, &pc_tree->horizontal[1]); + pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; + pc_tree->horizontal[1].skip = x->skip; + if (rate != INT_MAX && dist != INT64_MAX && + *totrate != INT_MAX && *totdist != INT64_MAX) { + *totrate += rate; + *totdist += dist; + } + } + break; + case PARTITION_SPLIT: + subsize = get_subsize(bsize, PARTITION_SPLIT); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, + subsize, output_enabled, totrate, totdist, + pc_tree->split[0]); + nonrd_use_partition(cpi, tile, mi + hbs, tp, + mi_row, mi_col + hbs, subsize, output_enabled, + &rate, &dist, pc_tree->split[1]); + if (rate != INT_MAX && dist != INT64_MAX && + *totrate != INT_MAX && *totdist != INT64_MAX) { + *totrate += rate; + *totdist += dist; + } + nonrd_use_partition(cpi, tile, mi + hbs * mis, tp, + mi_row + hbs, mi_col, subsize, output_enabled, + &rate, &dist, pc_tree->split[2]); + if (rate != INT_MAX && dist != INT64_MAX && + *totrate != INT_MAX && *totdist != INT64_MAX) { + *totrate += rate; + *totdist += dist; + } + nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp, + mi_row + hbs, mi_col + hbs, subsize, output_enabled, + &rate, &dist, pc_tree->split[3]); + if (rate != INT_MAX && dist != INT64_MAX && + *totrate != INT_MAX && *totdist != INT64_MAX) { + *totrate += rate; + *totdist += dist; + } + break; + default: + assert(0 && "Invalid partition type."); + break; + } + + if (bsize == BLOCK_64X64 && output_enabled) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, + *totrate, *totdist); + encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree); + } +} + +static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, TOKENEXTRA **tp) { + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + int mi_col; + + // Initialize the left context for the new SB row + vpx_memset(&xd->left_context, 0, sizeof(xd->left_context)); +
vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); + + // Code each SB in the row + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + int dummy_rate = 0; + int64_t dummy_dist = 0; + const int idx_str = cm->mi_stride * mi_row + mi_col; + MODE_INFO **mi = cm->mi_grid_visible + idx_str; + MODE_INFO **prev_mi = cm->prev_mi_grid_visible + idx_str; + BLOCK_SIZE bsize; + + x->in_static_area = 0; + x->source_variance = UINT_MAX; + vp9_zero(x->pred_mv); + + // Set the partition type of the 64X64 block + switch (sf->partition_search_type) { + case VAR_BASED_PARTITION: + choose_partitioning(cpi, tile, mi_row, mi_col); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + 1, &dummy_rate, &dummy_dist, cpi->pc_root); + break; + case SOURCE_VAR_BASED_PARTITION: + set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + 1, &dummy_rate, &dummy_dist, cpi->pc_root); + break; + case VAR_BASED_FIXED_PARTITION: + case FIXED_PARTITION: + bsize = sf->partition_search_type == FIXED_PARTITION ? + sf->always_this_block_size : + get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col); + set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, + 1, &dummy_rate, &dummy_dist, cpi->pc_root); + break; + case REFERENCE_PARTITION: + if (sf->partition_check || + !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) { + set_modeinfo_offsets(cm, xd, mi_row, mi_col); + auto_partition_range(cpi, tile, mi_row, mi_col, + &sf->min_partition_size, + &sf->max_partition_size); + nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, + &dummy_rate, &dummy_dist, 1, INT64_MAX, + cpi->pc_root); + } else { + copy_partitioning(cm, mi, prev_mi); + nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, + BLOCK_64X64, 1, &dummy_rate, &dummy_dist, + cpi->pc_root); + } + break; + default: + assert(0); + break; + } + } +} +// end RTC play code + +static int set_var_thresh_from_histogram(VP9_COMP *cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9_COMMON *const cm = &cpi->common; + + const uint8_t *src = cpi->Source->y_buffer; + const uint8_t *last_src = cpi->Last_Source->y_buffer; + const int src_stride = cpi->Source->y_stride; + const int last_stride = cpi->Last_Source->y_stride; + + // Pick cutoff threshold + const int cutoff = (MIN(cm->width, cm->height) >= 720) ? 
+ (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : + (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); + DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS); + diff *var16 = cpi->source_diff_var; + + int sum = 0; + int i, j; + + vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0])); + + for (i = 0; i < cm->mb_rows; i++) { + for (j = 0; j < cm->mb_cols; j++) { + vp9_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + + var16->var = var16->sse - + (((uint32_t)var16->sum * var16->sum) >> 8); + + if (var16->var >= VAR_HIST_MAX_BG_VAR) + hist[VAR_HIST_BINS - 1]++; + else + hist[var16->var / VAR_HIST_FACTOR]++; + + src += 16; + last_src += 16; + var16++; + } + + src = src - cm->mb_cols * 16 + 16 * src_stride; + last_src = last_src - cm->mb_cols * 16 + 16 * last_stride; + } + + cpi->source_var_thresh = 0; + + if (hist[VAR_HIST_BINS - 1] < cutoff) { + for (i = 0; i < VAR_HIST_BINS - 1; i++) { + sum += hist[i]; + + if (sum > cutoff) { + cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR; + return 0; + } + } + } + + return sf->search_type_check_frequency; +} + +static void source_var_based_partition_search_method(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + + if (cm->frame_type == KEY_FRAME) { + // For key frame, use SEARCH_PARTITION. + sf->partition_search_type = SEARCH_PARTITION; + } else if (cm->intra_only) { + sf->partition_search_type = FIXED_PARTITION; + } else { + if (cm->last_width != cm->width || cm->last_height != cm->height) { + if (cpi->source_diff_var) + vpx_free(cpi->source_diff_var); + + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(diff))); + } + + if (!cpi->frames_till_next_var_check) + cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi); + + if (cpi->frames_till_next_var_check > 0) { + sf->partition_search_type = FIXED_PARTITION; + cpi->frames_till_next_var_check--; + } + } +} + +static int get_skip_encode_frame(const VP9_COMMON *cm) { + unsigned int intra_count = 0, inter_count = 0; + int j; + + for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { + intra_count += cm->counts.intra_inter[j][0]; + inter_count += cm->counts.intra_inter[j][1]; + } + + return (intra_count << 2) < inter_count && + cm->frame_type != KEY_FRAME && + cm->show_frame; +} + +static void encode_tiles(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + TOKENEXTRA *tok = cpi->tok; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileInfo tile; + TOKENEXTRA *old_tok = tok; + int mi_row; + + vp9_tile_init(&tile, cm, tile_row, tile_col); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm)) + encode_nonrd_sb_row(cpi, &tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, &tile, mi_row, &tok); + } + cpi->tok_count[tile_row][tile_col] = (unsigned int)(tok - old_tok); + assert(tok - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); + } + } +} + +#if CONFIG_FP_MB_STATS +static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, + VP9_COMMON *cm, uint8_t **this_frame_mb_stats) { + uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start + + cm->current_video_frame * cm->MBs * sizeof(uint8_t); + + if (mb_stats_in > firstpass_mb_stats->mb_stats_end) + return EOF; + + *this_frame_mb_stats = mb_stats_in; + + 
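Backing up to set_var_thresh_from_histogram() above: each 16x16 block's variance is derived from its SSE and sum via the identity var = sse - sum^2/256, and source_var_thresh becomes the first bin boundary at which the cumulative histogram count clears the cutoff. A self-contained check of the variance identity (the VAR_HIST_* constants are not reproduced here):

#include <stdint.h>
#include <stdio.h>

/* var = sse - sum*sum/256 for a 16x16 block: the usual E[(x-m)^2]
 * expansion with N = 256 samples, matching the expression in
 * set_var_thresh_from_histogram above. */
static uint32_t var16x16(uint32_t sse, int sum) {
  return sse - (((uint32_t)sum * sum) >> 8);  /* >>8 == divide by 256 */
}

int main(void) {
  /* uniform diff of 2 on all 256 pels: sum = 512, sse = 1024 -> var = 0 */
  printf("flat:    %u\n", var16x16(1024, 512));
  /* single diff of 16 on an otherwise zero block: 256 - 1 = 255 */
  printf("outlier: %u\n", var16x16(256, 16));
  return 0;
}

The flat block lands in the lowest histogram bin even though its raw SSE is large, which is exactly why the histogram is built on variance rather than SSE.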
return 1; +} +#endif + +static void encode_frame_internal(VP9_COMP *cpi) { + SPEED_FEATURES *const sf = &cpi->sf; + RD_OPT *const rd_opt = &cpi->rd; + MACROBLOCK *const x = &cpi->mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + + vp9_zero(cm->counts); + vp9_zero(cpi->coef_counts); + vp9_zero(rd_opt->comp_pred_diff); + vp9_zero(rd_opt->filter_diff); + vp9_zero(rd_opt->tx_select_diff); + vp9_zero(rd_opt->tx_select_threshes); + + xd->lossless = cm->base_qindex == 0 && + cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && + cm->uv_ac_delta_q == 0; + + cm->tx_mode = select_tx_mode(cpi); + + x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4; + x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + + if (xd->lossless) { + x->optimize = 0; + cm->lf.filter_level = 0; + cpi->zbin_mode_boost_enabled = 0; + } + + vp9_frame_init_quantizer(cpi); + + vp9_initialize_rd_consts(cpi); + vp9_initialize_me_consts(cpi, cm->base_qindex); + init_encode_frame_mb_context(cpi); + set_prev_mi(cm); + + x->quant_fp = cpi->sf.use_quant_fp; + vp9_zero(x->skip_txfm); + if (sf->use_nonrd_pick_mode) { + // Initialize internal buffer pointers for rtc coding, where non-RD + // mode decision is used and hence no buffer pointer swap needed. + int i; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + p[i].coeff = ctx->coeff_pbuf[i][0]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][0]; + pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0]; + p[i].eobs = ctx->eobs_pbuf[i][0]; + } + vp9_zero(x->zcoeff_blk); + + if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) + source_var_based_partition_search_method(cpi); + } + + { + struct vpx_usec_timer emr_timer; + vpx_usec_timer_start(&emr_timer); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, + &cpi->twopass.this_frame_mb_stats); + } +#endif + + encode_tiles(cpi); + + vpx_usec_timer_mark(&emr_timer); + cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); + } + + sf->skip_encode_frame = sf->skip_encode_sb ? get_skip_encode_frame(cm) : 0; + +#if 0 + // Keep record of the total distortion this time around for future use + cpi->last_frame_distortion = cpi->frame_distortion; +#endif +} + +static INTERP_FILTER get_interp_filter( + const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) { + if (!is_alt_ref && + threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] && + threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] && + threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP_SMOOTH; + } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] && + threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP_SHARP; + } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) { + return EIGHTTAP; + } else { + return SWITCHABLE; + } } void vp9_encode_frame(VP9_COMP *cpi) { - VP9_COMMON * const cm = &cpi->common; + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a @@ -2206,10 +3395,10 @@ void vp9_encode_frame(VP9_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
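get_interp_filter() above consumes thresholds that vp9_encode_frame() below maintains as per-frame-type running averages: after each frame, thr = (thr + frame_diff / MBs) / 2, so the contribution of older frames decays geometrically. A sketch of that fold:

#include <stdint.h>
#include <stdio.h>

/* One fold of the running average used for mode_thrs, filter_thrs and
 * tx_thrs in vp9_encode_frame: new = (old + frame_diff / MBs) / 2. */
static int64_t fold_threshold(int64_t old_thr, int64_t frame_diff, int mbs) {
  return (old_thr + frame_diff / mbs) / 2;
}

int main(void) {
  int64_t thr = 0;
  int frame;
  /* hypothetical constant per-frame diff of 100000 over 100 MBs: the
   * threshold converges on 1000 with weights 1/2, 1/4, 1/8, ... */
  for (frame = 0; frame < 8; ++frame) {
    thr = fold_threshold(thr, 100000, 100);
    printf("frame %d: thr = %lld\n", frame, (long long)thr);
  }
  return 0;
}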
if (!frame_is_intra_only(cm)) { - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[GOLDEN_FRAME]) - || (cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[LAST_FRAME])) { + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) || + (cm->ref_frame_sign_bias[ALTREF_FRAME] == + cm->ref_frame_sign_bias[LAST_FRAME])) { cm->allow_comp_inter_inter = 0; } else { cm->allow_comp_inter_inter = 1; @@ -2219,112 +3408,73 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } - if (cpi->sf.RD) { - int i, pred_type; - INTERPOLATION_TYPE filter_type; - /* - * This code does a single RD pass over the whole frame assuming - * either compound, single or hybrid prediction as per whatever has - * worked best for that type of frame in the past. - * It also predicts whether another coding mode would have worked - * better that this coding mode. If that is the case, it remembers - * that for subsequent frames. - * It does the same analysis for transform size selection also. - */ - int frame_type = get_frame_type(cpi); + if (cpi->sf.frame_parameter_update) { + int i; + + // This code does a single RD pass over the whole frame assuming + // either compound, single or hybrid prediction as per whatever has + // worked best for that type of frame in the past. + // It also predicts whether another coding mode would have worked + // better than this coding mode. If that is the case, it remembers + // that for subsequent frames. + // It does the same analysis for transform size selection also. + const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); + int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type]; + int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type]; + int *const tx_thrs = rd_opt->tx_select_threshes[frame_type]; + const int is_alt_ref = frame_type == ALTREF_FRAME; /* prediction (compound, single or hybrid) mode selection */ - if (frame_type == 3 || !cm->allow_comp_inter_inter) - pred_type = SINGLE_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][1] - > cpi->rd_prediction_type_threshes[frame_type][0] - && cpi->rd_prediction_type_threshes[frame_type][1] - > cpi->rd_prediction_type_threshes[frame_type][2] - && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) - pred_type = COMP_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][0] - > cpi->rd_prediction_type_threshes[frame_type][2]) - pred_type = SINGLE_PREDICTION_ONLY; + if (is_alt_ref || !cm->allow_comp_inter_inter) + cm->reference_mode = SINGLE_REFERENCE; + else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] && + mode_thrs[COMPOUND_REFERENCE] > + mode_thrs[REFERENCE_MODE_SELECT] && + check_dual_ref_flags(cpi) && + cpi->static_mb_pct == 100) + cm->reference_mode = COMPOUND_REFERENCE; + else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT]) + cm->reference_mode = SINGLE_REFERENCE; else - pred_type = HYBRID_PREDICTION; + cm->reference_mode = REFERENCE_MODE_SELECT; - /* filter type selection */ - // FIXME(rbultje) for some odd reason, we often select smooth_filter - // as default filter for ARF overlay frames. This is a REALLY BAD - // IDEA so we explicitly disable it here.
- if (frame_type != 3 && - cpi->rd_filter_threshes[frame_type][1] > - cpi->rd_filter_threshes[frame_type][0] && - cpi->rd_filter_threshes[frame_type][1] > - cpi->rd_filter_threshes[frame_type][2] && - cpi->rd_filter_threshes[frame_type][1] > - cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { - filter_type = EIGHTTAP_SMOOTH; - } else if (cpi->rd_filter_threshes[frame_type][2] > - cpi->rd_filter_threshes[frame_type][0] && - cpi->rd_filter_threshes[frame_type][2] > - cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { - filter_type = EIGHTTAP_SHARP; - } else if (cpi->rd_filter_threshes[frame_type][0] > - cpi->rd_filter_threshes[frame_type][SWITCHABLE_FILTERS]) { - filter_type = EIGHTTAP; - } else { - filter_type = SWITCHABLE; - } + if (cm->interp_filter == SWITCHABLE) + cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); - cpi->mb.e_mbd.lossless = 0; - if (cpi->oxcf.lossless) { - cpi->mb.e_mbd.lossless = 1; - } - - /* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */ - select_tx_mode(cpi); - cpi->common.comp_pred_mode = pred_type; - cpi->common.mcomp_filter_type = filter_type; encode_frame_internal(cpi); - for (i = 0; i < NB_PREDICTION_TYPES; ++i) { - const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs); - cpi->rd_prediction_type_threshes[frame_type][i] += diff; - cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; - } + for (i = 0; i < REFERENCE_MODES; ++i) + mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2; - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { - const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs; - cpi->rd_filter_threshes[frame_type][i] = - (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; - } + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2; for (i = 0; i < TX_MODES; ++i) { - int64_t pd = cpi->rd_tx_select_diff[i]; - int diff; + int64_t pd = rd_opt->tx_select_diff[i]; if (i == TX_MODE_SELECT) - pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, - 2048 * (TX_SIZES - 1), 0); - diff = (int) (pd / cpi->common.MBs); - cpi->rd_tx_select_threshes[frame_type][i] += diff; - cpi->rd_tx_select_threshes[frame_type][i] /= 2; + pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0); + tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2; } - if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { int single_count_zero = 0; int comp_count_zero = 0; for (i = 0; i < COMP_INTER_CONTEXTS; i++) { - single_count_zero += cpi->comp_inter_count[i][0]; - comp_count_zero += cpi->comp_inter_count[i][1]; + single_count_zero += cm->counts.comp_inter[i][0]; + comp_count_zero += cm->counts.comp_inter[i][1]; } if (comp_count_zero == 0) { - cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY; - vp9_zero(cpi->comp_inter_count); + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(cm->counts.comp_inter); } else if (single_count_zero == 0) { - cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY; - vp9_zero(cpi->comp_inter_count); + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(cm->counts.comp_inter); } } - if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (cm->tx_mode == TX_MODE_SELECT) { int count4x4 = 0; int count8x8_lp = 0, count8x8_8x8p = 0; int count16x16_16x16p = 0, count16x16_lp = 0; @@ -2344,194 +3494,159 @@ void vp9_encode_frame(VP9_COMP *cpi) { count32x32 += cm->counts.tx.p32x32[i][TX_32X32]; } - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 - && 
count32x32 == 0) { - cpi->common.tx_mode = ALLOW_8X8; - reset_skip_txfm_size(cpi, TX_8X8); - } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 - && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { - cpi->common.tx_mode = ONLY_4X4; - reset_skip_txfm_size(cpi, TX_4X4); + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && + count32x32 == 0) { + cm->tx_mode = ALLOW_8X8; + reset_skip_tx_size(cm, TX_8X8); + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && + count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { + cm->tx_mode = ONLY_4X4; + reset_skip_tx_size(cm, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { - cpi->common.tx_mode = ALLOW_32X32; + cm->tx_mode = ALLOW_32X32; } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) { - cpi->common.tx_mode = ALLOW_16X16; - reset_skip_txfm_size(cpi, TX_16X16); + cm->tx_mode = ALLOW_16X16; + reset_skip_tx_size(cm, TX_16X16); } } } else { + cm->reference_mode = SINGLE_REFERENCE; encode_frame_internal(cpi); } } -static void sum_intra_stats(VP9_COMP *cpi, const MODE_INFO *mi) { - const MB_PREDICTION_MODE y_mode = mi->mbmi.mode; - const MB_PREDICTION_MODE uv_mode = mi->mbmi.uv_mode; +static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) { + const PREDICTION_MODE y_mode = mi->mbmi.mode; + const PREDICTION_MODE uv_mode = mi->mbmi.uv_mode; const BLOCK_SIZE bsize = mi->mbmi.sb_type; - ++cpi->y_uv_mode_count[y_mode][uv_mode]; - if (bsize < BLOCK_8X8) { int idx, idy; - const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) - for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) - ++cpi->y_mode_count[0][mi->bmi[idy * 2 + idx].as_mode]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_h) + for (idx = 0; idx < 2; idx += num_4x4_w) + ++counts->y_mode[0][mi->bmi[idy * 2 + idx].as_mode]; } else { - ++cpi->y_mode_count[size_group_lookup[bsize]][y_mode]; + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; + } + + ++counts->uv_mode[y_mode][uv_mode]; +} + +static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) { + if (enabled) { + if (is_inter_block(mbmi)) { + if (mbmi->mode == ZEROMV) { + return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST + : LF_ZEROMV_ZBIN_BOOST; + } else { + return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST + : MV_ZBIN_BOOST; + } + } else { + return INTRA_ZBIN_BOOST; + } + } else { + return 0; } } -// Experimental stub function to create a per MB zbin adjustment based on -// some previously calculated measure of MB activity. -static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) { -#if USE_ACT_INDEX - x->act_zbin_adj = *(x->mb_activity_ptr); -#else - int64_t a; - int64_t b; - int64_t act = *(x->mb_activity_ptr); - - // Apply the masking to the RD multiplier. 
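// (Editorial note, not part of the patch: with r = act / activity_avg,
// the removed adjustment below works out to roughly
// round((4r + 1) / (r + 4)) - 1 for r > 1, and the mirrored negative
// value 1 - round((r + 4) / (4r + 1)) for r <= 1, i.e. a bounded
// per-MB zbin offset with |act_zbin_adj| <= 3.)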
- a = act + 4 * cpi->activity_avg; - b = 4 * act + cpi->activity_avg; - - if (act > cpi->activity_avg) - x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1; - else - x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); -#endif -} static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, - int mi_row, int mi_col, BLOCK_SIZE bsize) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; - MODE_INFO **mi_8x8 = xd->mi_8x8; + int mi_row, int mi_col, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO **mi_8x8 = xd->mi; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; - PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize); - unsigned int segment_id = mbmi->segment_id; - const int mis = cm->mode_info_stride; + const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); + const int mis = cm->mi_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8; + + x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 && + cpi->oxcf.aq_mode != COMPLEXITY_AQ && + cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ && + cpi->sf.allow_skip_recode; + + if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode) + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + x->skip_optimize = ctx->is_coded; ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH); + if (x->skip_encode) return; - if (cm->frame_type == KEY_FRAME) { - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - adjust_act_zbin(cpi, x); - vp9_update_zbin_extra(cpi, x); - } - } else { - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { - // Adjust the zbin based on this MB rate. - adjust_act_zbin(cpi, x); - } - - // Experimental code. Special case for gf and arf zeromv modes. - // Increase zbin size to suppress noise - cpi->zbin_mode_boost = 0; - if (cpi->zbin_mode_boost_enabled) { - if (is_inter_block(mbmi)) { - if (mbmi->mode == ZEROMV) { - if (mbmi->ref_frame[0] != LAST_FRAME) - cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; - else - cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST; - } else if (mbmi->sb_type < BLOCK_8X8) { - cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST; - } else { - cpi->zbin_mode_boost = MV_ZBIN_BOOST; - } - } else { - cpi->zbin_mode_boost = INTRA_ZBIN_BOOST; - } - } - - vp9_update_zbin_extra(cpi, x); - } + // Experimental code. Special case for gf and arf zeromv modes. 
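// (The boost values come from get_zbin_mode_boost() above: ZEROMV inter
// blocks get GF_ZEROMV_ZBIN_BOOST or LF_ZEROMV_ZBIN_BOOST depending on
// the reference frame, other inter blocks get SPLIT_MV_ZBIN_BOOST or
// MV_ZBIN_BOOST by block size, intra blocks get INTRA_ZBIN_BOOST, and
// the boost is 0 when the feature is disabled.)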
+ // Increase zbin size to suppress noise + cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi, + cpi->zbin_mode_boost_enabled); + vp9_update_zbin_extra(cpi, x); if (!is_inter_block(mbmi)) { - vp9_encode_intra_block_y(x, MAX(bsize, BLOCK_8X8)); - vp9_encode_intra_block_uv(x, MAX(bsize, BLOCK_8X8)); + int plane; + mbmi->skip = 1; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) + vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane); if (output_enabled) - sum_intra_stats(cpi, mi); + sum_intra_stats(&cm->counts, mi); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { - int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[0])]; - YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; - YV12_BUFFER_CONFIG *second_ref_fb = NULL; - if (has_second_ref(mbmi)) { - idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame[1])]; - second_ref_fb = &cm->yv12_fb[idx]; + int ref; + const int is_compound = has_second_ref(mbmi); + for (ref = 0; ref < 1 + is_compound; ++ref) { + YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, + mbmi->ref_frame[ref]); + vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + &xd->block_refs[ref]->sf); } + if (!cpi->sf.reuse_inter_pred_sby || seg_skip) + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); - assert(cm->frame_type != KEY_FRAME); + vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); - setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, - &xd->scale_factor[0]); - setup_pre_planes(xd, 1, second_ref_fb, mi_row, mi_col, - &xd->scale_factor[1]); - - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); - } - - if (!is_inter_block(mbmi)) { - vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); - } else if (!x->skip) { - vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); - vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); - } else { - int mb_skip_context = xd->left_available ? mi_8x8[-1]->mbmi.skip_coeff : 0; - mb_skip_context += mi_8x8[-mis] ? 
mi_8x8[-mis]->mbmi.skip_coeff : 0; - - mbmi->skip_coeff = 1; - if (output_enabled) - cm->counts.mbskip[mb_skip_context][1]++; - reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); + if (!x->skip) { + mbmi->skip = 1; + vp9_encode_sb(x, MAX(bsize, BLOCK_8X8)); + vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); + } else { + mbmi->skip = 1; + if (output_enabled && !seg_skip) + cm->counts.skip[vp9_get_skip_context(xd)][1]++; + reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); + } } if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && - !(is_inter_block(mbmi) && - (mbmi->skip_coeff || - vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { - const uint8_t context = vp9_get_pred_context_tx_size(xd); - ++get_tx_counts(max_txsize_lookup[bsize], - context, &cm->counts.tx)[mbmi->tx_size]; + !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { + ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), + &cm->counts.tx)[mbmi->tx_size]; } else { int x, y; - TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode]; - assert(sizeof(tx_mode_to_biggest_tx_size) / - sizeof(tx_mode_to_biggest_tx_size[0]) == TX_MODES); + TX_SIZE tx_size; // The new intra coding scheme requires no change of transform size if (is_inter_block(&mi->mbmi)) { - if (sz == TX_32X32 && bsize < BLOCK_32X32) - sz = TX_16X16; - if (sz == TX_16X16 && bsize < BLOCK_16X16) - sz = TX_8X8; - if (sz == TX_8X8 && bsize < BLOCK_8X8) - sz = TX_4X4; - } else if (bsize >= BLOCK_8X8) { - sz = mbmi->tx_size; + tx_size = MIN(tx_mode_to_biggest_tx_size[cm->tx_mode], + max_txsize_lookup[bsize]); } else { - sz = TX_4X4; + tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4; } for (y = 0; y < mi_height; y++) for (x = 0; x < mi_width; x++) if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) - mi_8x8[mis * y + x]->mbmi.tx_size = sz; + mi_8x8[mis * y + x]->mbmi.tx_size = tx_size; } } } diff --git a/media/libvpx/vp9/encoder/vp9_encodeframe.h b/media/libvpx/vp9/encoder/vp9_encodeframe.h index 3e9f5381c06a..fd1c9aa6427a 100644 --- a/media/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/media/libvpx/vp9/encoder/vp9_encodeframe.h @@ -12,11 +12,29 @@ #ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_ #define VP9_ENCODER_VP9_ENCODEFRAME_H_ +#ifdef __cplusplus +extern "C" { +#endif + struct macroblock; struct yv12_buffer_config; +struct VP9_COMP; + +// Constants used in SOURCE_VAR_BASED_PARTITION +#define VAR_HIST_MAX_BG_VAR 1000 +#define VAR_HIST_FACTOR 10 +#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) +#define VAR_HIST_LARGE_CUT_OFF 75 +#define VAR_HIST_SMALL_CUT_OFF 45 void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mi_row, int mi_col); +void vp9_encode_frame(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/media/libvpx/vp9/encoder/vp9_encodeintra.c b/media/libvpx/vp9/encoder/vp9_encodeintra.c deleted file mode 100644 index 32b4593fcb86..000000000000 --- a/media/libvpx/vp9/encoder/vp9_encodeintra.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/common/vp9_reconintra.h" -#include "vp9/encoder/vp9_encodemb.h" -#include "vp9/encoder/vp9_encodeintra.h" - -int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) { - MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi; - x->skip_encode = 0; - mbmi->mode = DC_PRED; - mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16 - : TX_8X8) - : TX_4X4; - vp9_encode_intra_block_y(x, mbmi->sb_type); - return vp9_get_mb_ss(x->plane[0].src_diff); -} diff --git a/media/libvpx/vp9/encoder/vp9_encodeintra.h b/media/libvpx/vp9/encoder/vp9_encodeintra.h deleted file mode 100644 index e217924653f1..000000000000 --- a/media/libvpx/vp9/encoder/vp9_encodeintra.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_ENCODER_VP9_ENCODEINTRA_H_ -#define VP9_ENCODER_VP9_ENCODEINTRA_H_ - -#include "vp9/encoder/vp9_onyx_int.h" - -int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred); -void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg); - -#endif // VP9_ENCODER_VP9_ENCODEINTRA_H_ diff --git a/media/libvpx/vp9/encoder/vp9_encodemb.c b/media/libvpx/vp9/encoder/vp9_encodemb.c index a85ddee454da..667845072507 100644 --- a/media/libvpx/vp9/encoder/vp9_encodemb.c +++ b/media/libvpx/vp9/encoder/vp9_encodemb.c @@ -19,75 +19,61 @@ #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_dct.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_quantize.h" -#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; +}; + +struct encode_b_args { + MACROBLOCK *x; + struct optimize_ctx *ctx; + int8_t *skip; +}; + void vp9_subtract_block_c(int rows, int cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) - diff_ptr[c] = src_ptr[c] - pred_ptr[c]; + diff[c] = src[c] - pred[c]; - diff_ptr += diff_stride; - pred_ptr += pred_stride; - src_ptr += src_stride; + diff += diff_stride; + pred += pred_stride; + src += src_stride; } } -static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int bw = plane_block_width(bsize, pd); - const int bh = plane_block_height(bsize, pd); + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const 
int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize]; - vp9_subtract_block(bh, bw, p->src_diff, bw, - p->src.buf, p->src.stride, + vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { - subtract_plane(x, bsize, 0); -} - -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) { - int i; - - for (i = 1; i < MAX_MB_PLANE; i++) - subtract_plane(x, bsize, i); -} - -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { - vp9_subtract_sby(x, bsize); - vp9_subtract_sbuv(x, bsize); -} - #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) -typedef struct vp9_token_state vp9_token_state; -struct vp9_token_state { +typedef struct vp9_token_state { int rate; int error; int next; signed char token; short qc; -}; +} vp9_token_state; // TODO(jimbankoski): experiment to find optimal RD numbers. -#define Y1_RD_MULT 4 -#define UV_RD_MULT 2 - -static const int plane_rd_mult[4] = { - Y1_RD_MULT, - UV_RD_MULT, -}; +static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 }; #define UPDATE_RD_COST()\ {\ @@ -112,61 +98,56 @@ static int trellis_get_coeff_context(const int16_t *scan, return pt; } -static void optimize_b(MACROBLOCK *mb, - int plane, int block, BLOCK_SIZE plane_bsize, - ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, - TX_SIZE tx_size) { +static int optimize_b(MACROBLOCK *mb, int plane, int block, + TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; - struct macroblockd_plane *pd = &xd->plane[plane]; - const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); + struct macroblock_plane *const p = &mb->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ref = is_inter_block(&xd->mi[0]->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; - const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); - int16_t *qcoeff_ptr; - int16_t *dqcoeff_ptr; - int eob = pd->eobs[block], final_eob, sz = 0; - const int i0 = 0; - int rc, x, next, i; - int64_t rdmult, rddiv, rd_cost0, rd_cost1; - int rate0, rate1, error0, error1, t0, t1; - int best, band, pt; - PLANE_TYPE type = pd->plane_type; - int err_mult = plane_rd_mult[type]; - const int default_eob = 16 << (tx_size << 1); - const int16_t *scan, *nb; - const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; + const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int eob = p->eobs[block]; + const PLANE_TYPE type = pd->plane_type; + const int default_eob = 16 << (tx_size << 1); + const int mul = 1 + (tx_size == TX_32X32); const int16_t *dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); + const scan_order *const so = get_scan(xd, tx_size, type, block); + const int16_t *const scan = so->scan; + const int16_t *const nb = so->neighbors; + int next = eob, sz = 0; + int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv; + int64_t rd_cost0, rd_cost1; + int rate0, rate1, error0, error1, t0, t1; + int best, band, pt, i, final_eob; assert((!type && !plane) || (type && plane)); - dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); - qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - get_scan(xd, tx_size, type, block, &scan, &nb); assert(eob <= default_eob); /* Now set up a Viterbi trellis to evaluate alternative roundings. 
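 *
 * (Editorial sketch: each scan position i carries two states --
 * tokens[i][0] keeps the quantized level as-is, tokens[i][1] tries the
 * level reduced by one toward zero.  Walking from the EOB back to the
 * first coefficient, each state records the RDCOST-cheaper of the two
 * continuations at the next position (token rate plus squared
 * dequantization error), and the winning path is finally traced forward
 * to rewrite qcoeff/dqcoeff and recompute the EOB.)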
*/ - rdmult = mb->rdmult * err_mult; - if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) + if (!ref) rdmult = (rdmult * 9) >> 4; - rddiv = mb->rddiv; + /* Initialize the sentinel node of the trellis. */ tokens[eob][0].rate = 0; tokens[eob][0].error = 0; tokens[eob][0].next = default_eob; - tokens[eob][0].token = DCT_EOB_TOKEN; + tokens[eob][0].token = EOB_TOKEN; tokens[eob][0].qc = 0; - *(tokens[eob] + 1) = *(tokens[eob] + 0); - next = eob; + tokens[eob][1] = tokens[eob][0]; + for (i = 0; i < eob; i++) - token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ - qcoeff_ptr[scan[i]]].token]; + token_cache[scan[i]] = + vp9_pt_energy_class[vp9_dct_value_tokens_ptr[qcoeff[scan[i]]].token]; - for (i = eob; i-- > i0;) { + for (i = eob; i-- > 0;) { int base_bits, d2, dx; - - rc = scan[i]; - x = qcoeff_ptr[rc]; + const int rc = scan[i]; + int x = qcoeff[rc]; /* Only add a trellis state for non-zero coefficients. */ if (x) { int shortcut = 0; @@ -180,18 +161,16 @@ static void optimize_b(MACROBLOCK *mb, if (next < default_eob) { band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += - mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][0].token]; - rate1 += - mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][1].token]; + rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] + [tokens[next][0].token]; + rate1 += mb->token_costs[tx_size][type][ref][band][0][pt] + [tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ best = rd_cost1 < rd_cost0; - base_bits = *(vp9_dct_value_cost_ptr + x); - dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]); + base_bits = vp9_dct_value_cost_ptr[x]; + dx = mul * (dqcoeff[rc] - coeff[rc]); d2 = dx * dx; tokens[i][0].rate = base_bits + (best ? rate1 : rate0); tokens[i][0].error = d2 + (best ? error1 : error0); @@ -204,9 +183,9 @@ static void optimize_b(MACROBLOCK *mb, rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; - if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) && - (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul + - dequant_ptr[rc != 0])) + if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) && + (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul + + dequant_ptr[rc != 0])) shortcut = 1; else shortcut = 0; @@ -221,21 +200,19 @@ static void optimize_b(MACROBLOCK *mb, /* If we reduced this coefficient to zero, check to see if * we need to move the EOB back here. */ - t0 = tokens[next][0].token == DCT_EOB_TOKEN ? - DCT_EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == DCT_EOB_TOKEN ? - DCT_EOB_TOKEN : ZERO_TOKEN; + t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; + t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; } else { t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token; } if (next < default_eob) { band = band_translate[i + 1]; - if (t0 != DCT_EOB_TOKEN) { + if (t0 != EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] [tokens[next][0].token]; } - if (t1 != DCT_EOB_TOKEN) { + if (t1 != EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] [tokens[next][1].token]; @@ -245,7 +222,7 @@ static void optimize_b(MACROBLOCK *mb, UPDATE_RD_COST(); /* And pick the best. 
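 * (i.e. the cheaper of the two continuations at the next position;
 * this fills tokens[i][1], the state in which the coefficient has
 * been reduced toward zero)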
*/ best = rd_cost1 < rd_cost0; - base_bits = *(vp9_dct_value_cost_ptr + x); + base_bits = vp9_dct_value_cost_ptr[x]; if (shortcut) { dx -= (dequant_ptr[rc != 0] + sz) ^ sz; @@ -267,12 +244,12 @@ static void optimize_b(MACROBLOCK *mb, t0 = tokens[next][0].token; t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. */ - if (t0 != DCT_EOB_TOKEN) { + if (t0 != EOB_TOKEN) { tokens[next][0].rate += mb->token_costs[tx_size][type][ref][band][1][0][t0]; tokens[next][0].token = ZERO_TOKEN; } - if (t1 != DCT_EOB_TOKEN) { + if (t1 != EOB_TOKEN) { tokens[next][1].rate += mb->token_costs[tx_size][type][ref][band][1][0][t1]; tokens[next][1].token = ZERO_TOKEN; @@ -284,131 +261,192 @@ static void optimize_b(MACROBLOCK *mb, /* Now pick the best path through the whole trellis. */ band = band_translate[i + 1]; - pt = combine_entropy_contexts(*a, *l); rate0 = tokens[next][0].rate; rate1 = tokens[next][1].rate; error0 = tokens[next][0].error; error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0]; + rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; - final_eob = i0 - 1; - vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2))); - vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2))); + final_eob = -1; + vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2))); + vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2))); for (i = next; i < eob; i = next) { - x = tokens[i][best].qc; + const int x = tokens[i][best].qc; + const int rc = scan[i]; if (x) { final_eob = i; } - rc = scan[i]; - qcoeff_ptr[rc] = x; - dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul; + + qcoeff[rc] = x; + dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul; next = tokens[i][best].next; best = best_index[i][best]; } final_eob++; - xd->plane[plane].eobs[block] = final_eob; - *a = *l = (final_eob > 0); + mb->plane[plane].eobs[block] = final_eob; + return final_eob; } -void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { - int x, y; - txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); - optimize_b(mb, plane, block, plane_bsize, - &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size); +static INLINE void fdct32x32(int rd_transform, + const int16_t *src, int16_t *dst, int src_stride) { + if (rd_transform) + vp9_fdct32x32_rd(src, dst, src_stride); + else + vp9_fdct32x32(src, dst, src_stride); } -static void optimize_init_b(int plane, BLOCK_SIZE bsize, - struct encode_b_args *args) { - const MACROBLOCKD *xd = &args->x->e_mbd; - const struct macroblockd_plane* const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi) : mbmi->tx_size; - - vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], - pd->above_context, pd->left_context, - num_4x4_w, num_4x4_h); -} - -void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct encode_b_args* const args = arg; - MACROBLOCK* const x = args->x; - MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - int16_t *coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); - int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int16_t *scan, *iscan; - uint16_t *eob = &pd->eobs[block]; - const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; - const int twl = bwl - tx_size, twmask = (1 << twl) - 1; - int xoff, yoff; - int16_t *src_diff; +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + int i, j; + const int16_t *src_diff; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; switch (tx_size) { case TX_32X32: - scan = vp9_default_scan_32x32; - iscan = vp9_default_iscan_32x32; - block >>= 6; - xoff = 32 * (block & twmask); - yoff = 32 * (block >> twl); - src_diff = p->src_diff + 4 * bw * yoff + xoff; - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, bw * 4); - else - vp9_fdct32x32(src_diff, coeff, bw * 4); - vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); break; case TX_16X16: - scan = vp9_default_scan_16x16; - iscan = vp9_default_iscan_16x16; - block >>= 4; - xoff = 16 * (block & twmask); - yoff = 16 * (block >> twl); - src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_fdct16x16(src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + vp9_fdct16x16(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); break; case TX_8X8: - scan = vp9_default_scan_8x8; - iscan = vp9_default_iscan_8x8; - block >>= 2; - xoff = 8 * (block & twmask); - yoff = 8 * (block >> twl); - src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_fdct8x8(src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + vp9_fdct8x8(src_diff, coeff, 
diff_stride); + vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); break; case TX_4X4: - scan = vp9_default_scan_4x4; - iscan = vp9_default_iscan_4x4; - xoff = 4 * (block & twmask); - yoff = 4 * (block >> twl); - src_diff = p->src_diff + 4 * bw * yoff + xoff; - x->fwd_txm4x4(src_diff, coeff, bw * 4); - vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, - p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp, + p->quant_fp, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); break; default: assert(0); + break; + } +} + +void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + int i, j; + const int16_t *src_diff; + + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; + + switch (tx_size) { + case TX_32X32: + vp9_fdct32x32_1(src_diff, coeff, diff_stride); + vp9_quantize_dc_32x32(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_16X16: + vp9_fdct16x16_1(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_8X8: + vp9_fdct8x8_1(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_quantize_dc(coeff, x->skip_block, p->round, + p->quant_fp[0], qcoeff, dqcoeff, + pd->dequant[0], eob); + break; + default: + assert(0); + break; + } +} + +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); + int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + int i, j; + const int16_t *src_diff; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; + + switch (tx_size) { + case TX_32X32: + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); + break; + case TX_16X16: + vp9_fdct16x16(src_diff, coeff, diff_stride); + 
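// (Editorial note: the second argument of the vp9_quantize_* calls is
// the coefficient count of the transform block -- 1024, 256, 64 and 16
// for 32x32 down to 4x4.)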
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_8X8: + vp9_fdct8x8(src_diff, coeff, diff_stride); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + case TX_4X4: + x->fwd_txm4x4(src_diff, coeff, diff_stride); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, + scan_order->scan, scan_order->iscan); + break; + default: + assert(0); + break; } } @@ -418,6 +456,87 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx *const ctx = args->ctx; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + int i, j; + uint8_t *dst; + ENTROPY_CONTEXT *a, *l; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; + a = &ctx->ta[plane][i]; + l = &ctx->tl[plane][j]; + + // TODO(jingning): per transformed block zero forcing only enabled for + // luma component. will integrate chroma components as well. + if (x->zcoeff_blk[tx_size][block] && plane == 0) { + p->eobs[block] = 0; + *a = *l = 0; + return; + } + + if (!x->skip_recode) { + if (max_txsize_lookup[plane_bsize] == tx_size) { + if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) { + // full forward transform and quantization + if (x->quant_fp) + vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); + else + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) { + // fast path forward transform and quantization + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; + return; + } + } else { + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } + } + + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { + const int ctx = combine_entropy_contexts(*a, *l); + *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0; + } else { + *a = *l = p->eobs[block] > 0; + } + + if (p->eobs[block]) + *(args->skip) = 0; + + if (x->skip_encode || p->eobs[block] == 0) + return; + + switch (tx_size) { + case TX_32X32: + vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + case TX_16X16: + vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + case TX_8X8: + vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + case TX_4X4: + // this is like vp9_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
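// (Editorial note: x->itxm_add was bound in encode_frame_internal()
// earlier in this patch to vp9_iwht4x4_add for lossless streams and
// vp9_idct4x4_add otherwise, which is why the eob<=1 behavior matters
// for exact reconstruction.)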
+ x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + break; + default: + assert(0 && "Invalid transform size"); + break; + } +} + +static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + MACROBLOCK *const x = (MACROBLOCK *)arg; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); int i, j; @@ -425,229 +544,148 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; - // TODO(jingning): per transformed block zero forcing only enabled for - // luma component. will integrate chroma components as well. - if (x->zcoeff_blk[tx_size][block] && plane == 0) { - pd->eobs[block] = 0; - ctx->ta[plane][i] = 0; - ctx->tl[plane][j] = 0; - return; - } + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); - if (!x->skip_recode) - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); - - if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); - } else { - ctx->ta[plane][i] = pd->eobs[block] > 0; - ctx->tl[plane][j] = pd->eobs[block] > 0; - } - - if (x->skip_encode || pd->eobs[block] == 0) - return; - - switch (tx_size) { - case TX_32X32: - vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); - break; - case TX_16X16: - vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); - break; - case TX_8X8: - vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); - break; - case TX_4X4: - // this is like vp9_short_idct4x4 but has a special case around eob<=1 - // which is significant (not just an optimization) for the lossless - // case. 
- xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); - break; - default: - assert(!"Invalid transform size"); - } + if (p->eobs[block] > 0) + x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } -static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct encode_b_args *const args = arg; - MACROBLOCK *const x = args->x; - MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, - block); - - int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, - pd->dst.buf, pd->dst.stride); - - vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); - - if (pd->eobs[block] == 0) - return; - - xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); -} - -void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; - struct optimize_ctx ctx; - struct encode_b_args arg = {x, &ctx}; - - vp9_subtract_sby(x, bsize); - if (x->optimize) - optimize_init_b(0, bsize, &arg); - - foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { + vp9_subtract_plane(x, bsize, 0); + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, x); } void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; - struct encode_b_args arg = {x, &ctx}; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct encode_b_args arg = {x, &ctx, &mbmi->skip}; + int plane; - if (!x->skip_recode) - vp9_subtract_sb(x, bsize); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + if (!x->skip_recode) + vp9_subtract_plane(x, bsize, plane); - if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - optimize_init_b(i, bsize, &arg); + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { + const struct macroblockd_plane* const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(mbmi, pd) : mbmi->tx_size; + vp9_get_entropy_contexts(bsize, tx_size, pd, + ctx.ta[plane], ctx.tl[plane]); + } + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block, + &arg); } - - foreach_transformed_block(xd, bsize, encode_block, &arg); } -void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { +static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct encode_b_args* const args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct macroblock_plane *const p = &x->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; int16_t *coeff = BLOCK_OFFSET(p->coeff, block); - int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); + int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const int16_t *scan, *iscan; + const scan_order *scan_order; TX_TYPE tx_type; - MB_PREDICTION_MODE mode; - const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; - const int twl = bwl - tx_size, twmask = (1 << twl) - 1; - int xoff, yoff; + PREDICTION_MODE mode; + const int bwl = b_width_log2(plane_bsize); + const int diff_stride = 4 * (1 << bwl); uint8_t *src, *dst; int16_t *src_diff; - uint16_t *eob = &pd->eobs[block]; - - if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) - extend_for_intra(xd, plane_bsize, plane, block, tx_size); - - // if (x->optimize) - // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); + uint16_t *eob = &p->eobs[block]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + int i, j; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + dst = &pd->dst.buf[4 * (j * dst_stride + i)]; + src = &p->src.buf[4 * (j * src_stride + i)]; + src_diff = &p->src_diff[4 * (j * diff_stride + i)]; switch (tx_size) { case TX_32X32: - scan = vp9_default_scan_32x32; - iscan = vp9_default_iscan_32x32; + scan_order = &vp9_default_scan_orders[TX_32X32]; mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; - block >>= 6; - xoff = 32 * (block & twmask); - yoff = 32 * (block >> twl); - dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode, - dst, pd->dst.stride, dst, pd->dst.stride); - + vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode, + x->skip_encode ? src : dst, + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_subtract_block(32, 32, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - if (x->use_lp32x32fdct) - vp9_fdct32x32_rd(src_diff, coeff, bw * 4); - else - vp9_fdct32x32(src_diff, coeff, bw * 4); + vp9_subtract_block(32, 32, src_diff, diff_stride, + src, src_stride, dst, dst_stride); + fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); + vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob); break; case TX_16X16: - tx_type = get_tx_type_16x16(pd->plane_type, xd); - scan = get_scan_16x16(tx_type); - iscan = get_iscan_16x16(tx_type); + tx_type = get_tx_type(pd->plane_type, xd); + scan_order = &vp9_scan_orders[TX_16X16][tx_type]; mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; - block >>= 4; - xoff = 16 * (block & twmask); - yoff = 16 * (block >> twl); - dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode, - dst, pd->dst.stride, dst, pd->dst.stride); + vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_subtract_block(16, 16, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); + vp9_subtract_block(16, 16, src_diff, diff_stride, + src, src_stride, dst, dst_stride); + vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); + vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; case TX_8X8: - tx_type = get_tx_type_8x8(pd->plane_type, xd); - scan = get_scan_8x8(tx_type); - iscan = get_iscan_8x8(tx_type); + tx_type = get_tx_type(pd->plane_type, xd); + scan_order = &vp9_scan_orders[TX_8X8][tx_type]; mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; - block >>= 2; - xoff = 8 * (block & twmask); - yoff = 8 * (block >> twl); - dst = pd->dst.buf + yoff * pd->dst.stride + xoff; - vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode, - dst, pd->dst.stride, dst, pd->dst.stride); + vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode, + x->skip_encode ? src : dst, + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_subtract_block(8, 8, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); - vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); + vp9_subtract_block(8, 8, src_diff, diff_stride, + src, src_stride, dst, dst_stride); + vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); } if (!x->skip_encode && *eob) - vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); + vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; case TX_4X4: tx_type = get_tx_type_4x4(pd->plane_type, xd, block); - scan = get_scan_4x4(tx_type); - iscan = get_iscan_4x4(tx_type); - if (mbmi->sb_type < BLOCK_8X8 && plane == 0) - mode = xd->mi_8x8[0]->bmi[block].as_mode; - else - mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; - - xoff = 4 * (block & twmask); - yoff = 4 * (block >> twl); - dst = pd->dst.buf + yoff * pd->dst.stride + xoff; + scan_order = &vp9_scan_orders[TX_4X4][tx_type]; + mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode; vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, - dst, pd->dst.stride, dst, pd->dst.stride); + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride, i, j, plane); if (!x->skip_recode) { - src = p->src.buf + yoff * p->src.stride + xoff; - src_diff = p->src_diff + 4 * bw * yoff + xoff; - vp9_subtract_block(4, 4, src_diff, bw * 4, - src, p->src.stride, dst, pd->dst.stride); + vp9_subtract_block(4, 4, src_diff, diff_stride, + src, src_stride, dst, dst_stride); if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); + vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, bw * 4); + x->fwd_txm4x4(src_diff, coeff, diff_stride); vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, - pd->dequant, p->zbin_extra, eob, scan, iscan); + pd->dequant, p->zbin_extra, eob, scan_order->scan, + scan_order->iscan); } if (!x->skip_encode && *eob) { @@ -655,28 +693,31 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob); + x->itxm_add(dqcoeff, dst, dst_stride, *eob); else - vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type); + vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; default: assert(0); + break; } + if (*eob) + *(args->skip) = 0; } -void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct optimize_ctx ctx; - struct encode_b_args arg = {x, &ctx}; - - foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra, - &arg); -} -void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) { - MACROBLOCKD* const xd = &x->e_mbd; - struct optimize_ctx ctx; - struct encode_b_args arg = {x, &ctx}; - foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg); +void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int8_t *skip) { + struct encode_b_args arg = {x, NULL, skip}; + encode_block_intra(plane, block, plane_bsize, tx_size, &arg); } + +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { + const MACROBLOCKD *const xd = &x->e_mbd; + struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip}; + + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra, + &arg); +} diff --git a/media/libvpx/vp9/encoder/vp9_encodemb.h b/media/libvpx/vp9/encoder/vp9_encodemb.h index 61dd7358e0e3..1999718650ac 100644 --- a/media/libvpx/vp9/encoder/vp9_encodemb.h +++ b/media/libvpx/vp9/encoder/vp9_encodemb.h @@ -13,42 +13,32 @@ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_onyxc_int.h" -typedef struct { - MB_PREDICTION_MODE mode; - MV_REFERENCE_FRAME ref_frame; - MV_REFERENCE_FRAME second_ref_frame; -} MODE_DEFINITION; - -typedef struct { - MV_REFERENCE_FRAME ref_frame; - MV_REFERENCE_FRAME second_ref_frame; -} REF_DEFINITION; - -struct optimize_ctx { - ENTROPY_CONTEXT ta[MAX_MB_PLANE][16]; - ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; -}; - -struct encode_b_args { - MACROBLOCK *x; - struct optimize_ctx *ctx; -}; +#ifdef __cplusplus +extern "C" { +#endif void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); +void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); +void vp9_xform_quant(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size); -void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg); +void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); -void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int8_t *skip); -void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize); -void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/media/libvpx/vp9/encoder/vp9_encodemv.c b/media/libvpx/vp9/encoder/vp9_encodemv.c 
index 030ca64134ec..9d42a124f954 100644 --- a/media/libvpx/vp9/encoder/vp9_encodemv.c +++ b/media/libvpx/vp9/encoder/vp9_encodemv.c @@ -13,12 +13,21 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encodemv.h" +static struct vp9_token mv_joint_encodings[MV_JOINTS]; +static struct vp9_token mv_class_encodings[MV_CLASSES]; +static struct vp9_token mv_fp_encodings[MV_FP_SIZE]; +static struct vp9_token mv_class0_encodings[CLASS0_SIZE]; -#ifdef ENTROPY_STATS -extern unsigned int active_section; -#endif +void vp9_entropy_mv_init() { + vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree); + vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree); + vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree); + vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree); +} static void encode_mv_component(vp9_writer* w, int comp, const nmv_component* mvcomp, int usehp) { @@ -36,13 +45,13 @@ static void encode_mv_component(vp9_writer* w, int comp, vp9_write(w, sign, mvcomp->sign); // Class - write_token(w, vp9_mv_class_tree, mvcomp->classes, - &vp9_mv_class_encodings[mv_class]); + vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes, + &mv_class_encodings[mv_class]); // Integer bits if (mv_class == MV_CLASS_0) { - write_token(w, vp9_mv_class0_tree, mvcomp->class0, - &vp9_mv_class0_encodings[d]); + vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0, + &mv_class0_encodings[d]); } else { int i; const int n = mv_class + CLASS0_BITS - 1; // number of bits @@ -51,9 +60,9 @@ static void encode_mv_component(vp9_writer* w, int comp, } // Fractional bits - write_token(w, vp9_mv_fp_tree, - mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp, - &vp9_mv_fp_encodings[fr]); + vp9_write_token(w, vp9_mv_fp_tree, + mv_class == MV_CLASS_0 ? 
mvcomp->class0_fp[d] : mvcomp->fp, + &mv_fp_encodings[fr]); // High precision bit if (usehp) @@ -68,7 +77,7 @@ static void build_nmv_component_cost_table(int *mvcost, int i, v; int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; int bits_cost[MV_OFFSET_BITS][2]; - int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4]; + int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; int class0_hp_cost[2], hp_cost[2]; sign_cost[0] = vp9_cost_zero(mvcomp->sign); @@ -126,138 +135,66 @@ static void build_nmv_component_cost_table(int *mvcost, static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p, vp9_prob upd_p) { - const vp9_prob new_p = get_binary_prob(ct[0], ct[1]); - vp9_prob mod_p = new_p | 1; - const int cur_b = cost_branch256(ct, *cur_p); - const int mod_b = cost_branch256(ct, mod_p); - const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p)); - if (cur_b - mod_b > cost) { - *cur_p = mod_p; - vp9_write(w, 1, upd_p); - vp9_write_literal(w, mod_p >> 1, 7); - return 1; - } else { - vp9_write(w, 0, upd_p); - return 0; + const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1; + const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) > + cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256; + vp9_write(w, update, upd_p); + if (update) { + *cur_p = new_p; + vp9_write_literal(w, new_p >> 1, 7); } + return update; } -static void counts_to_nmv_context( - nmv_context_counts *nmv_count, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]) { - int i, j, k; - vp9_tree_probs_from_distribution(vp9_mv_joint_tree, branch_ct_joint, - nmv_count->joints); - for (i = 0; i < 2; ++i) { - const uint32_t s0 = nmv_count->comps[i].sign[0]; - const uint32_t s1 = nmv_count->comps[i].sign[1]; +static void write_mv_update(const vp9_tree_index *tree, + vp9_prob probs[/*n - 1*/], + const unsigned int counts[/*n - 1*/], + int n, vp9_writer *w) { + int i; + unsigned int branch_ct[32][2]; - branch_ct_sign[i][0] = s0; - branch_ct_sign[i][1] = s1; - vp9_tree_probs_from_distribution(vp9_mv_class_tree, - branch_ct_classes[i], - nmv_count->comps[i].classes); - vp9_tree_probs_from_distribution(vp9_mv_class0_tree, - branch_ct_class0[i], - nmv_count->comps[i].class0); - for (j = 0; j < MV_OFFSET_BITS; ++j) { - const uint32_t b0 = nmv_count->comps[i].bits[j][0]; - const uint32_t b1 = nmv_count->comps[i].bits[j][1]; + // Assuming max number of probabilities <= 32 + assert(n <= 32); - branch_ct_bits[i][j][0] = b0; - branch_ct_bits[i][j][1] = b1; - } - } - for (i = 0; i < 2; ++i) { - for (k = 0; k < CLASS0_SIZE; ++k) { - vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - branch_ct_class0_fp[i][k], - nmv_count->comps[i].class0_fp[k]); - } - vp9_tree_probs_from_distribution(vp9_mv_fp_tree, - branch_ct_fp[i], - nmv_count->comps[i].fp); - } - if (usehp) { - for (i = 0; i < 2; ++i) { - const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0]; - const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1]; - const uint32_t hp0 = nmv_count->comps[i].hp[0]; - const uint32_t hp1 = nmv_count->comps[i].hp[1]; - - branch_ct_class0_hp[i][0] = c0_hp0; - branch_ct_class0_hp[i][1] = c0_hp1; - - branch_ct_hp[i][0] 
= hp0; - branch_ct_hp[i][1] = hp1; - } - } + vp9_tree_probs_from_distribution(tree, branch_ct, counts); + for (i = 0; i < n - 1; ++i) + update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB); } -void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) { +void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) { int i, j; - unsigned int branch_ct_joint[MV_JOINTS - 1][2]; - unsigned int branch_ct_sign[2][2]; - unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; - unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2]; - unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2]; - unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2]; - unsigned int branch_ct_fp[2][4 - 1][2]; - unsigned int branch_ct_class0_hp[2][2]; - unsigned int branch_ct_hp[2][2]; - nmv_context *mvc = &cpi->common.fc.nmvc; + nmv_context *const mvc = &cm->fc.nmvc; + nmv_context_counts *const counts = &cm->counts.mv; - counts_to_nmv_context(&cpi->NMVcount, usehp, - branch_ct_joint, branch_ct_sign, branch_ct_classes, - branch_ct_class0, branch_ct_bits, - branch_ct_class0_fp, branch_ct_fp, - branch_ct_class0_hp, branch_ct_hp); - - for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB); + write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w); for (i = 0; i < 2; ++i) { - update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB); - for (j = 0; j < MV_CLASSES - 1; ++j) - update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j], - NMV_UPDATE_PROB); - - for (j = 0; j < CLASS0_SIZE - 1; ++j) - update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j], - NMV_UPDATE_PROB); + nmv_component *comp = &mvc->comps[i]; + nmv_component_counts *comp_counts = &counts->comps[i]; + update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB); + write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes, + MV_CLASSES, w); + write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0, + CLASS0_SIZE, w); for (j = 0; j < MV_OFFSET_BITS; ++j) - update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j], - NMV_UPDATE_PROB); + update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB); } for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - int k; - for (k = 0; k < 3; ++k) - update_mv(bc, branch_ct_class0_fp[i][j][k], - &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB); - } + for (j = 0; j < CLASS0_SIZE; ++j) + write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j], + counts->comps[i].class0_fp[j], MV_FP_SIZE, w); - for (j = 0; j < 3; ++j) - update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB); + write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp, + MV_FP_SIZE, w); } if (usehp) { for (i = 0; i < 2; ++i) { - update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp, - NMV_UPDATE_PROB); - update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp, - NMV_UPDATE_PROB); + update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp, + MV_UPDATE_PROB); + update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB); } } } @@ -270,7 +207,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); usehp = usehp && vp9_use_mv_hp(ref); - write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]); + vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]); if (mv_joint_vertical(j)) encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); @@ -279,40 +216,34 @@ void 
vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. - if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) { + if (cpi->sf.mv.auto_mv_step_size) { unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3; cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude); } } -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], - const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h) { - vp9_clear_system_state(); - vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree); - if (mvc_flag_v) - build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp); - if (mvc_flag_h) - build_nmv_component_cost_table(mvcost[1], &mvctx->comps[1], usehp); +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context* ctx, int usehp) { + vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp); } -static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound, +static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2], nmv_context_counts *counts) { int i; - for (i = 0; i < 1 + is_compound; ++i) { - const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row, - mv[i].as_mv.col - ref[i].as_mv.col }; + + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) { + const MV *ref = &mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv; + const MV diff = {mvs[i].as_mv.row - ref->row, + mvs[i].as_mv.col - ref->col}; vp9_inc_mv(&diff, counts); } } -void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) { - MODE_INFO *mi = x->e_mbd.mi_8x8[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; - const int is_compound = has_second_ref(mbmi); +void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) { + const MODE_INFO *mi = xd->mi[0]; + const MB_MODE_INFO *const mbmi = &mi->mbmi; if (mbmi->sb_type < BLOCK_8X8) { const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type]; @@ -323,10 +254,12 @@ void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int i = idy * 2 + idx; if (mi->bmi[i].as_mode == NEWMV) - inc_mvs(mi->bmi[i].as_mv, best_ref_mv, is_compound, &cpi->NMVcount); + inc_mvs(mbmi, mi->bmi[i].as_mv, &cm->counts.mv); } } - } else if (mbmi->mode == NEWMV) { - inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount); + } else { + if (mbmi->mode == NEWMV) + inc_mvs(mbmi, mbmi->mv, &cm->counts.mv); } } + diff --git a/media/libvpx/vp9/encoder/vp9_encodemv.h b/media/libvpx/vp9/encoder/vp9_encodemv.h index 633177885367..e67f9e3b0756 100644 --- a/media/libvpx/vp9/encoder/vp9_encodemv.h +++ b/media/libvpx/vp9/encoder/vp9_encodemv.h @@ -12,20 +12,26 @@ #ifndef VP9_ENCODER_VP9_ENCODEMV_H_ #define VP9_ENCODER_VP9_ENCODEMV_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" -void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const); +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_entropy_mv_init(); + +void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w); void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref, const nmv_context* mvctx, int usehp); -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], - const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h); +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context* mvctx, int 
usehp); -void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]); +void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/media/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/vp9/encoder/vp9_encoder.c new file mode 100644 index 000000000000..2ca91b977c68 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_encoder.c @@ -0,0 +1,3055 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "vpx/internal/vpx_psnr.h" +#include "vpx_ports/vpx_timer.h" + +#include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_idct.h" +#if CONFIG_VP9_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/common/vp9_tile_common.h" + +#include "vp9/encoder/vp9_aq_complexity.h" +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_aq_variance.h" +#include "vp9/encoder/vp9_bitstream.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_picklpf.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/encoder/vp9_speed_features.h" +#if CONFIG_INTERNAL_STATS +#include "vp9/encoder/vp9_ssim.h" +#endif +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_resize.h" +#include "vp9/encoder/vp9_svc_layercontext.h" + +void vp9_coef_tree_initialize(); + +#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ + +#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv + // for altref computation. +#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision + // mv. Choose a very high value for + // now so that HIGH_PRECISION is always + // chosen. 
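+
+/* Illustrative sketch, not part of the upstream change: how the two
+ * thresholds above are typically combined. High-precision motion vectors
+ * are forced on when computing the altref frame, and are otherwise
+ * enabled while the quantizer index stays below HIGH_PRECISION_MV_QTHRESH
+ * (200, i.e. effectively always with the current value). The helper name
+ * is hypothetical; the resulting flag would be applied through
+ * vp9_set_high_precision_mv() below, which swaps in the high-precision
+ * MV cost tables. */
+#if 0
+static int use_high_precision_mv(int qindex, int is_altref) {
+  if (is_altref)
+    return ALTREF_HIGH_PRECISION_MV;
+  return qindex < HIGH_PRECISION_MV_QTHRESH;
+}
+#endif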
+ +// #define OUTPUT_YUV_REC + +#ifdef OUTPUT_YUV_DENOISED +FILE *yuv_denoised_file = NULL; +#endif +#ifdef OUTPUT_YUV_REC +FILE *yuv_rec_file; +#endif + +#if 0 +FILE *framepsnr; +FILE *kf_list; +FILE *keyfile; +#endif + +static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + +void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) { + MACROBLOCK *const mb = &cpi->mb; + cpi->common.allow_high_precision_mv = allow_high_precision_mv; + if (cpi->common.allow_high_precision_mv) { + mb->mvcost = mb->nmvcost_hp; + mb->mvsadcost = mb->nmvsadcost_hp; + } else { + mb->mvcost = mb->nmvcost; + mb->mvsadcost = mb->nmvsadcost; + } +} + +static void setup_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + if (frame_is_intra_only(cm) || cm->error_resilient_mode) { + vp9_setup_past_independence(cm); + } else { + if (!cpi->use_svc) + cm->frame_context_idx = cpi->refresh_alt_ref_frame; + } + + if (cm->frame_type == KEY_FRAME) { + if (!is_two_pass_svc(cpi)) + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + vp9_zero(cpi->interp_filter_selected); + } else { + cm->fc = cm->frame_contexts[cm->frame_context_idx]; + vp9_zero(cpi->interp_filter_selected[0]); + } +} + +void vp9_initialize_enc() { + static int init_done = 0; + + if (!init_done) { + vp9_rtcd(); + vp9_init_neighbors(); + vp9_init_intra_predictors(); + vp9_coef_tree_initialize(); + vp9_tokenize_initialize(); + vp9_init_me_luts(); + vp9_rc_init_minq_luts(); + vp9_entropy_mv_init(); + vp9_entropy_mode_init(); + vp9_temporal_filter_init(); + init_done = 1; + } +} + +static void dealloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int i; + + // Delete sementation map + vpx_free(cpi->segmentation_map); + cpi->segmentation_map = NULL; + vpx_free(cm->last_frame_seg_map); + cm->last_frame_seg_map = NULL; + vpx_free(cpi->coding_context.last_frame_seg_map_copy); + cpi->coding_context.last_frame_seg_map_copy = NULL; + + vpx_free(cpi->complexity_map); + cpi->complexity_map = NULL; + + vpx_free(cpi->nmvcosts[0]); + vpx_free(cpi->nmvcosts[1]); + cpi->nmvcosts[0] = NULL; + cpi->nmvcosts[1] = NULL; + + vpx_free(cpi->nmvcosts_hp[0]); + vpx_free(cpi->nmvcosts_hp[1]); + cpi->nmvcosts_hp[0] = NULL; + cpi->nmvcosts_hp[1] = NULL; + + vpx_free(cpi->nmvsadcosts[0]); + vpx_free(cpi->nmvsadcosts[1]); + cpi->nmvsadcosts[0] = NULL; + cpi->nmvsadcosts[1] = NULL; + + vpx_free(cpi->nmvsadcosts_hp[0]); + vpx_free(cpi->nmvsadcosts_hp[1]); + cpi->nmvsadcosts_hp[0] = NULL; + cpi->nmvsadcosts_hp[1] = NULL; + + vp9_cyclic_refresh_free(cpi->cyclic_refresh); + cpi->cyclic_refresh = NULL; + + vp9_free_ref_frame_buffers(cm); + vp9_free_context_buffers(cm); + + vp9_free_frame_buffer(&cpi->last_frame_uf); + vp9_free_frame_buffer(&cpi->scaled_source); + vp9_free_frame_buffer(&cpi->scaled_last_source); + vp9_free_frame_buffer(&cpi->alt_ref_buffer); + vp9_lookahead_destroy(cpi->lookahead); + + 
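  // Free the token buffer; vp9_alloc_compressor_data() reallocates it
+  // (sized by get_token_alloc()) whenever the coded frame size changes.
+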
vpx_free(cpi->tok); + cpi->tok = 0; + + vp9_free_pc_tree(cpi); + + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i]; + vpx_free(lc->rc_twopass_stats_in.buf); + lc->rc_twopass_stats_in.buf = NULL; + lc->rc_twopass_stats_in.sz = 0; + } + + if (cpi->source_diff_var != NULL) { + vpx_free(cpi->source_diff_var); + cpi->source_diff_var = NULL; + } + + for (i = 0; i < MAX_LAG_BUFFERS; ++i) { + vp9_free_frame_buffer(&cpi->svc.scaled_frames[i]); + } + vpx_memset(&cpi->svc.scaled_frames[0], 0, + MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0])); +} + +static void save_coding_context(VP9_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + VP9_COMMON *cm = &cpi->common; + + // Stores a snapshot of key state variables which can subsequently be + // restored with a call to vp9_restore_coding_context. These functions are + // intended for use in a re-code loop in vp9_compress_frame where the + // quantizer value is adjusted between loop iterations. + vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); + + vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0], + MV_VALS * sizeof(*cpi->nmvcosts[0])); + vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1], + MV_VALS * sizeof(*cpi->nmvcosts[1])); + vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0], + MV_VALS * sizeof(*cpi->nmvcosts_hp[0])); + vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1], + MV_VALS * sizeof(*cpi->nmvcosts_hp[1])); + + vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); + + vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, + cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); + + vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); + vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); + + cc->fc = cm->fc; +} + +static void restore_coding_context(VP9_COMP *cpi) { + CODING_CONTEXT *const cc = &cpi->coding_context; + VP9_COMMON *cm = &cpi->common; + + // Restore key state variables to the snapshot state stored in the + // previous call to vp9_save_coding_context. + vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); + + vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], + MV_VALS * sizeof(*cc->nmvcosts[0])); + vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], + MV_VALS * sizeof(*cc->nmvcosts[1])); + vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0], + MV_VALS * sizeof(*cc->nmvcosts_hp[0])); + vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1], + MV_VALS * sizeof(*cc->nmvcosts_hp[1])); + + vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); + + vpx_memcpy(cm->last_frame_seg_map, + cpi->coding_context.last_frame_seg_map_copy, + (cm->mi_rows * cm->mi_cols)); + + vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); + vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); + + cm->fc = cc->fc; +} + +static void configure_static_seg_features(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + int high_q = (int)(rc->avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->frame_type == KEY_FRAME) { + // Clear down the global segmentation map + vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + seg->update_map = 0; + seg->update_data = 0; + cpi->static_mb_pct = 0; + + // Disable segmentation + vp9_disable_segmentation(seg); + + // Clear down the segment features. 
+ vp9_clearall_segfeatures(seg); + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + seg->update_map = 0; + seg->update_data = 0; + cpi->static_mb_pct = 0; + + // Disable segmentation and individual segment features by default + vp9_disable_segmentation(seg); + vp9_clearall_segfeatures(seg); + + // Scan frames from current to arf frame. + // This function re-enables segmentation if appropriate. + vp9_update_mbgraph_stats(cpi); + + // If segmentation was enabled set those features needed for the + // arf itself. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); + + // Where relevant assume segment data is delta data + seg->abs_delta = SEGMENT_DELTADATA; + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + if (rc->source_alt_ref_active) { + seg->update_map = 0; + seg->update_data = 1; + seg->abs_delta = SEGMENT_DELTADATA; + + qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + + vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); + + // Segment coding disabled for compred testing + if (high_q || (cpi->static_mb_pct == 100)) { + vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + } else { + // Disable segmentation and clear down features if alt ref + // is not active for this group + + vp9_disable_segmentation(seg); + + vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + vp9_clearall_segfeatures(seg); + } + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. 
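+      // (The segment map and per-segment data signalled on an earlier
+      // frame simply remain in effect.)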
+ seg->update_map = 0; + seg->update_data = 0; + } + } +} + +static void update_reference_segmentation_map(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible; + uint8_t *cache_ptr = cm->last_frame_seg_map; + int row, col; + + for (row = 0; row < cm->mi_rows; row++) { + MODE_INFO **mi_8x8 = mi_8x8_ptr; + uint8_t *cache = cache_ptr; + for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) + cache[0] = mi_8x8[0]->mbmi.segment_id; + mi_8x8_ptr += cm->mi_stride; + cache_ptr += cm->mi_cols; + } +} + +static void alloc_raw_frame_buffers(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + + cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + oxcf->lag_in_frames); + if (!cpi->lookahead) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate lag buffers"); + + if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, + oxcf->width, oxcf->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); +} + +static void alloc_ref_frame_buffers(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (vp9_alloc_ref_frame_buffers(cm, cm->width, cm->height)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffers"); +} + +static void alloc_util_frame_buffers(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + if (vp9_realloc_frame_buffer(&cpi->scaled_source, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + + if (vp9_realloc_frame_buffer(&cpi->scaled_last_source, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate scaled last source buffer"); +} + +void vp9_alloc_compressor_data(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + vp9_alloc_context_buffers(cm, cm->width, cm->height); + + vpx_free(cpi->tok); + + { + unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); + CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + } + + vp9_setup_pc_tree(&cpi->common, cpi); +} + +static void update_frame_size(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + + vp9_set_mb_mi(cm, cm->width, cm->height); + vp9_init_context_buffers(cm); + init_macroblockd(cm, xd); + + if (is_two_pass_svc(cpi)) { + if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) + vpx_internal_error(&cm->error, 
VPX_CODEC_MEM_ERROR, + "Failed to reallocate alt_ref_buffer"); + } +} + +void vp9_new_framerate(VP9_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; + vp9_rc_update_framerate(cpi); +} + +static void set_tile_limits(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + + int min_log2_tile_cols, max_log2_tile_cols; + vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); + + cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, + min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; +} + +static void init_buffer_indices(VP9_COMP *cpi) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; +} + +static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { + VP9_COMMON *const cm = &cpi->common; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; + + cm->profile = oxcf->profile; + cm->bit_depth = oxcf->bit_depth; + cm->color_space = UNKNOWN; + + cm->width = oxcf->width; + cm->height = oxcf->height; + vp9_alloc_compressor_data(cpi); + + // Spatial scalability. + cpi->svc.number_spatial_layers = oxcf->ss_number_layers; + // Temporal scalability. + cpi->svc.number_temporal_layers = oxcf->ts_number_layers; + + if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + cpi->oxcf.pass == 2)) { + vp9_init_layer_context(cpi); + } + + // change includes all joint functionality + vp9_change_config(cpi, oxcf); + + cpi->static_mb_pct = 0; + cpi->ref_frame_flags = 0; + + init_buffer_indices(cpi); + + set_tile_limits(cpi); +} + +static void set_rc_buffer_sizes(RATE_CONTROL *rc, + const VP9EncoderConfig *oxcf) { + const int64_t bandwidth = oxcf->target_bandwidth; + const int64_t starting = oxcf->starting_buffer_level_ms; + const int64_t optimal = oxcf->optimal_buffer_level_ms; + const int64_t maximum = oxcf->maximum_buffer_size_ms; + + rc->starting_buffer_level = starting * bandwidth / 1000; + rc->optimal_buffer_level = (optimal == 0) ? bandwidth / 8 + : optimal * bandwidth / 1000; + rc->maximum_buffer_size = (maximum == 0) ? bandwidth / 8 + : maximum * bandwidth / 1000; +} + +void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + if (cm->profile != oxcf->profile) + cm->profile = oxcf->profile; + cm->bit_depth = oxcf->bit_depth; + + if (cm->profile <= PROFILE_1) + assert(cm->bit_depth == VPX_BITS_8); + else + assert(cm->bit_depth > VPX_BITS_8); + + cpi->oxcf = *oxcf; + + rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; + + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 1; + cm->refresh_frame_context = 1; + cm->reset_frame_context = 0; + + vp9_reset_segment_features(&cm->seg); + vp9_set_high_precision_mv(cpi, 0); + + { + int i; + + for (i = 0; i < MAX_SEGMENTS; i++) + cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; + } + cpi->encode_breakout = cpi->oxcf.encode_breakout; + + set_rc_buffer_sizes(rc, &cpi->oxcf); + + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size); + + // Set up frame rate and related parameters rate control values. 
+ vp9_new_framerate(cpi, cpi->framerate); + + // Set absolute upper and lower quality limits + rc->worst_quality = cpi->oxcf.worst_allowed_q; + rc->best_quality = cpi->oxcf.best_allowed_q; + + cm->interp_filter = cpi->sf.default_interp_filter; + + cm->display_width = cpi->oxcf.width; + cm->display_height = cpi->oxcf.height; + + if (cpi->initial_width) { + // Increasing the size of the frame beyond the first seen frame, or some + // otherwise signaled maximum size, is not supported. + // TODO(jkoleszar): exit gracefully. + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + } + update_frame_size(cpi); + + if ((cpi->svc.number_temporal_layers > 1 && + cpi->oxcf.rc_mode == VPX_CBR) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + cpi->oxcf.pass == 2)) { + vp9_update_layer_context_change_config(cpi, + (int)cpi->oxcf.target_bandwidth); + } + + cpi->alt_ref_source = NULL; + rc->is_src_frame_alt_ref = 0; + +#if 0 + // Experimental RD Code + cpi->frame_distortion = 0; + cpi->last_frame_distortion = 0; +#endif + + set_tile_limits(cpi); + + cpi->ext_refresh_frame_flags_pending = 0; + cpi->ext_refresh_frame_context_pending = 0; + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS); + } +#endif +} + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log (x) / (float) M_LOG2_E) + +static void cal_nmvjointsadcost(int *mvjointsadcost) { + mvjointsadcost[0] = 600; + mvjointsadcost[1] = 300; + mvjointsadcost[2] = 300; + mvjointsadcost[3] = 300; +} + +static void cal_nmvsadcosts(int *mvsadcost[2]) { + int i = 1; + + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= MV_MAX); +} + +static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { + int i = 1; + + mvsadcost[0][0] = 0; + mvsadcost[1][0] = 0; + + do { + double z = 256 * (2 * (log2f(8 * i) + .6)); + mvsadcost[0][i] = (int)z; + mvsadcost[1][i] = (int)z; + mvsadcost[0][-i] = (int)z; + mvsadcost[1][-i] = (int)z; + } while (++i <= MV_MAX); +} + + +VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { + unsigned int i, j; + VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL; + + if (!cm) + return NULL; + + vp9_zero(*cpi); + + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + vp9_remove_compressor(cpi); + return 0; + } + + cm->error.setjmp = 1; + + cpi->use_svc = 0; + + init_config(cpi, oxcf); + vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); + + cm->current_video_frame = 0; + cpi->skippable_frame = 0; + + // Create the encoder segmentation map and set all entries to 0 + CHECK_MEM_ERROR(cm, cpi->segmentation_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); + + // Create a complexity map used for rd adjustment + CHECK_MEM_ERROR(cm, cpi->complexity_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); + + // Create a map used for cyclic background refresh. 
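+  // (The map is consumed during encoding when aq_mode == CYCLIC_REFRESH_AQ;
+  // see the vp9_cyclic_refresh_setup() call in encode_without_recode_loop().)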
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+  // Also allocate a placeholder for the coding context, used when we
+  // want to save and restore it.
+  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+                  vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
+                  vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
+  for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
+                   sizeof(cpi->mbgraph_stats[0])); i++) {
+    CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
+                    vpx_calloc(cm->MBs *
+                               sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+  }
+
+#if CONFIG_FP_MB_STATS
+  cpi->use_fp_mb_stats = 0;
+  if (cpi->use_fp_mb_stats) {
+    // A placeholder buffer used to store the first-pass macroblock stats.
+    CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+                    vpx_calloc(cm->MBs * sizeof(uint8_t), 1));
+  } else {
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+
+  cpi->refresh_alt_ref_frame = 0;
+
+  // Note that at the moment multi_arf will not work with svc.
+  // For the current check-in, all execution paths default to 0 pending
+  // further tuning and testing. The code is left in place here as a
+  // placeholder for the required paths.
+  cpi->multi_arf_last_grp_enabled = 0;
+  if (oxcf->pass == 2) {
+    if (cpi->use_svc) {
+      cpi->multi_arf_allowed = 0;
+      cpi->multi_arf_enabled = 0;
+    } else {
+      // Disable by default for now.
+ cpi->multi_arf_allowed = 0; + cpi->multi_arf_enabled = 0; + } + } else { + cpi->multi_arf_allowed = 0; + cpi->multi_arf_enabled = 0; + } + + cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; +#if CONFIG_INTERNAL_STATS + cpi->b_calculate_ssimg = 0; + + cpi->count = 0; + cpi->bytes = 0; + + if (cpi->b_calculate_psnr) { + cpi->total_y = 0.0; + cpi->total_u = 0.0; + cpi->total_v = 0.0; + cpi->total = 0.0; + cpi->total_sq_error = 0; + cpi->total_samples = 0; + + cpi->totalp_y = 0.0; + cpi->totalp_u = 0.0; + cpi->totalp_v = 0.0; + cpi->totalp = 0.0; + cpi->totalp_sq_error = 0; + cpi->totalp_samples = 0; + + cpi->tot_recode_hits = 0; + cpi->summed_quality = 0; + cpi->summed_weights = 0; + cpi->summedp_quality = 0; + cpi->summedp_weights = 0; + } + + if (cpi->b_calculate_ssimg) { + cpi->total_ssimg_y = 0; + cpi->total_ssimg_u = 0; + cpi->total_ssimg_v = 0; + cpi->total_ssimg_all = 0; + } + +#endif + + cpi->first_time_stamp_ever = INT64_MAX; + + cal_nmvjointsadcost(cpi->mb.nmvjointsadcost); + cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX]; + cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX]; + cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX]; + cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX]; + cal_nmvsadcosts(cpi->mb.nmvsadcost); + + cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX]; + cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX]; + cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX]; + cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX]; + cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); + +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + yuv_denoised_file = fopen("denoised.yuv", "ab"); +#endif +#endif +#ifdef OUTPUT_YUV_REC + yuv_rec_file = fopen("rec.yuv", "wb"); +#endif + +#if 0 + framepsnr = fopen("framepsnr.stt", "a"); + kf_list = fopen("kf_list.stt", "w"); +#endif + + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + + if (oxcf->pass == 1) { + vp9_init_first_pass(cpi); + } else if (oxcf->pass == 2) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); + + if (cpi->svc.number_spatial_layers > 1 + || cpi->svc.number_temporal_layers > 1) { + FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; + FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = {0}; + int i; + + for (i = 0; i < oxcf->ss_number_layers; ++i) { + FIRSTPASS_STATS *const last_packet_for_layer = + &stats[packets - oxcf->ss_number_layers + i]; + const int layer_id = (int)last_packet_for_layer->spatial_layer_id; + const int packets_in_layer = (int)last_packet_for_layer->count + 1; + if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { + LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id]; + + vpx_free(lc->rc_twopass_stats_in.buf); + + lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; + CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf, + vpx_malloc(lc->rc_twopass_stats_in.sz)); + lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; + lc->twopass.stats_in = lc->twopass.stats_in_start; + lc->twopass.stats_in_end = lc->twopass.stats_in_start + + packets_in_layer - 1; + stats_copy[layer_id] = lc->rc_twopass_stats_in.buf; + } + } + + for (i = 0; i < packets; ++i) { + const int layer_id = (int)stats[i].spatial_layer_id; + if (layer_id >= 0 && layer_id < oxcf->ss_number_layers + && stats_copy[layer_id] != NULL) { + *stats_copy[layer_id] = stats[i]; + ++stats_copy[layer_id]; + } + } + + vp9_init_second_pass_spatial_svc(cpi); + } else { +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + const 
size_t psz = cpi->common.MBs * sizeof(uint8_t); + const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); + + cpi->twopass.firstpass_mb_stats.mb_stats_start = + oxcf->firstpass_mb_stats_in.buf; + cpi->twopass.firstpass_mb_stats.mb_stats_end = + cpi->twopass.firstpass_mb_stats.mb_stats_start + + (ps - 1) * cpi->common.MBs * sizeof(uint8_t); + } +#endif + + cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; + cpi->twopass.stats_in = cpi->twopass.stats_in_start; + cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1]; + + vp9_init_second_pass(cpi); + } + } + + vp9_set_speed_features(cpi); + + // Allocate memory to store variances for a frame. + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(diff))); + cpi->source_var_thresh = 0; + cpi->frames_till_next_var_check = 0; + + // Default rd threshold factors for mode selection + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) + cpi->rd.thresh_freq_fact[i][j] = 32; + } + +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx3f = SDX3F; \ + cpi->fn_ptr[BT].sdx8f = SDX8F; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; + + BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg, + vp9_variance32x16, vp9_sub_pixel_variance32x16, + vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d) + + BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg, + vp9_variance16x32, vp9_sub_pixel_variance16x32, + vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d) + + BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg, + vp9_variance64x32, vp9_sub_pixel_variance64x32, + vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d) + + BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg, + vp9_variance32x64, vp9_sub_pixel_variance32x64, + vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d) + + BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg, + vp9_variance32x32, vp9_sub_pixel_variance32x32, + vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8, + vp9_sad32x32x4d) + + BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg, + vp9_variance64x64, vp9_sub_pixel_variance64x64, + vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8, + vp9_sad64x64x4d) + + BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, + vp9_variance16x16, vp9_sub_pixel_variance16x16, + vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8, + vp9_sad16x16x4d) + + BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg, + vp9_variance16x8, vp9_sub_pixel_variance16x8, + vp9_sub_pixel_avg_variance16x8, + vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) + + BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg, + vp9_variance8x16, vp9_sub_pixel_variance8x16, + vp9_sub_pixel_avg_variance8x16, + vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) + + BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg, + vp9_variance8x8, vp9_sub_pixel_variance8x8, + vp9_sub_pixel_avg_variance8x8, + vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) + + BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg, + vp9_variance8x4, vp9_sub_pixel_variance8x4, + vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d) + + BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg, + vp9_variance4x8, vp9_sub_pixel_variance4x8, + vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d) + + BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg, + vp9_variance4x4, vp9_sub_pixel_variance4x4, + vp9_sub_pixel_avg_variance4x4, + vp9_sad4x4x3, 
vp9_sad4x4x8, vp9_sad4x4x4d) + + /* vp9_init_quantizer() is first called here. Add check in + * vp9_frame_init_quantizer() so that vp9_init_quantizer is only + * called later when needed. This will avoid unnecessary calls of + * vp9_init_quantizer() for every frame. + */ + vp9_init_quantizer(cpi); + + vp9_loop_filter_init(cm); + + cm->error.setjmp = 0; + + return cpi; +} + +void vp9_remove_compressor(VP9_COMP *cpi) { + unsigned int i; + + if (!cpi) + return; + + if (cpi && (cpi->common.current_video_frame > 0)) { +#if CONFIG_INTERNAL_STATS + + vp9_clear_system_state(); + + // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count); + if (cpi->oxcf.pass != 1) { + FILE *f = fopen("opsnr.stt", "a"); + double time_encoded = (cpi->last_end_time_stamp_seen + - cpi->first_time_stamp_ever) / 10000000.000; + double total_encode_time = (cpi->time_receive_data + + cpi->time_compress_data) / 1000.000; + double dr = (double)cpi->bytes * (double) 8 / (double)1000 + / time_encoded; + + if (cpi->b_calculate_psnr) { + const double total_psnr = + vpx_sse_to_psnr((double)cpi->total_samples, 255.0, + (double)cpi->total_sq_error); + const double totalp_psnr = + vpx_sse_to_psnr((double)cpi->totalp_samples, 255.0, + (double)cpi->totalp_sq_error); + const double total_ssim = 100 * pow(cpi->summed_quality / + cpi->summed_weights, 8.0); + const double totalp_ssim = 100 * pow(cpi->summedp_quality / + cpi->summedp_weights, 8.0); + + fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" + "VPXSSIM\tVPSSIMP\t Time(ms)\n"); + fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n", + dr, cpi->total / cpi->count, total_psnr, + cpi->totalp / cpi->count, totalp_psnr, total_ssim, totalp_ssim, + total_encode_time); + } + + if (cpi->b_calculate_ssimg) { + fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n"); + fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr, + cpi->total_ssimg_y / cpi->count, + cpi->total_ssimg_u / cpi->count, + cpi->total_ssimg_v / cpi->count, + cpi->total_ssimg_all / cpi->count, total_encode_time); + } + + fclose(f); + } + +#endif + +#if 0 + { + printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); + printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, + cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, + cpi->time_compress_data / 1000, + (cpi->time_receive_data + cpi->time_compress_data) / 1000); + } +#endif + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_free(&(cpi->denoiser)); + } +#endif + + dealloc_compressor_data(cpi); + vpx_free(cpi->tok); + + for (i = 0; i < sizeof(cpi->mbgraph_stats) / + sizeof(cpi->mbgraph_stats[0]); ++i) { + vpx_free(cpi->mbgraph_stats[i].mb_stats); + } + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + vpx_free(cpi->twopass.frame_mb_stats_buf); + cpi->twopass.frame_mb_stats_buf = NULL; + } +#endif + + vp9_remove_common(&cpi->common); + vpx_free(cpi); + +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + fclose(yuv_denoised_file); +#endif +#endif +#ifdef OUTPUT_YUV_REC + fclose(yuv_rec_file); +#endif + +#if 0 + + if (keyfile) + fclose(keyfile); + + if (framepsnr) + fclose(framepsnr); + + if (kf_list) + fclose(kf_list); + +#endif +} +static int64_t get_sse(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + unsigned int 
sse = 0; + int sum = 0; + int x, y; + + if (dw > 0) { + variance(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height, &sse, &sum); + total_sse += sse; + } + + if (dh > 0) { + variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); + total_sse += sse; + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + vp9_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +typedef struct { + double psnr[4]; // total/y/u/v + uint64_t sse[4]; // total/y/u/v + uint32_t samples[4]; // total/y/u/v +} PSNR_STATS; + +static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + const int widths[3] = {a->y_width, a->uv_width, a->uv_width }; + const int heights[3] = {a->y_height, a->uv_height, a->uv_height}; + const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride}; + const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride}; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = get_sse(a_planes[i], a_strides[i], + b_planes[i], b_strides[i], + w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, 255.0, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, 255.0, + (double)total_sse); +} + +static void generate_psnr_packet(VP9_COMP *cpi) { + struct vpx_codec_cx_pkt pkt; + int i; + PSNR_STATS psnr; + calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr); + for (i = 0; i < 4; ++i) { + pkt.data.psnr.samples[i] = psnr.samples[i]; + pkt.data.psnr.sse[i] = psnr.sse[i]; + pkt.data.psnr.psnr[i] = psnr.psnr[i]; + } + pkt.kind = VPX_CODEC_PSNR_PKT; + vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); +} + +int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) { + if (ref_frame_flags > 7) + return -1; + + cpi->ref_frame_flags = ref_frame_flags; + return 0; +} + +void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { + cpi->ext_refresh_golden_frame = (ref_frame_flags & VP9_GOLD_FLAG) != 0; + cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & VP9_ALT_FLAG) != 0; + cpi->ext_refresh_last_frame = (ref_frame_flags & VP9_LAST_FLAG) != 0; + cpi->ext_refresh_frame_flags_pending = 1; +} + +static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(VP9_COMP *cpi, + VP9_REFFRAME ref_frame_flag) { + MV_REFERENCE_FRAME ref_frame = NONE; + if (ref_frame_flag == VP9_LAST_FLAG) + ref_frame = LAST_FRAME; + else if (ref_frame_flag == VP9_GOLD_FLAG) + ref_frame = GOLDEN_FRAME; + else if (ref_frame_flag == VP9_ALT_FLAG) + ref_frame = ALTREF_FRAME; + + return ref_frame == NONE ? 
NULL : get_ref_frame_buffer(cpi, ref_frame); +} + +int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); + if (cfg) { + vp8_yv12_copy_frame(cfg, sd); + return 0; + } else { + return -1; + } +} + +int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd) { + YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); + if (cfg) { + vp8_yv12_copy_frame(sd, cfg); + return 0; + } else { + return -1; + } +} + +int vp9_update_entropy(VP9_COMP * cpi, int update) { + cpi->ext_refresh_frame_context = update; + cpi->ext_refresh_frame_context_pending = 1; + return 0; +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +#if defined(OUTPUT_YUV_DENOISED) +// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it +// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do +// not denoise the UV channels at this time. If ever we implement UV channel +// denoising we will have to modify this. +void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { + uint8_t *src = s->y_buffer; + int h = s->y_height; + + do { + fwrite(src, s->y_width, 1, f); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height / 2; + + do { + fwrite(src, s->uv_width / 2, 1, f); + src += s->uv_stride + s->uv_width / 2; + } while (--h); + + src = s->v_buffer; + h = s->uv_height / 2; + + do { + fwrite(src, s->uv_width / 2, 1, f); + src += s->uv_stride + s->uv_width / 2; + } while (--h); +} +#endif +#endif + +#ifdef OUTPUT_YUV_REC +void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { + YV12_BUFFER_CONFIG *s = cm->frame_to_show; + uint8_t *src = s->y_buffer; + int h = cm->height; + + do { + fwrite(src, s->y_width, 1, yuv_rec_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_height; + + do { + fwrite(src, s->uv_width, 1, yuv_rec_file); + src += s->uv_stride; + } while (--h); + + fflush(yuv_rec_file); +} +#endif + +static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t + int i; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int src_heights[3] = {src->y_crop_height, src->uv_crop_height, + src->uv_crop_height}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + const int dst_widths[3] = {dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width}; + const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height}; + + for (i = 0; i < MAX_MB_PLANE; ++i) + vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i], + dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]); + + vp9_extend_frame_borders(dst); +} + +static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const uint8_t *const srcs[3] = 
{src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP); + int x, y, i; + + for (y = 0; y < dst_h; y += 16) { + for (x = 0; x < dst_w; x += 16) { + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int x_q4 = x * (16 / factor) * src_w / dst_w; + const int y_q4 = y * (16 / factor) * src_h / dst_h; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h * + src_stride + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride, + kernel[x_q4 & 0xf], 16 * src_w / dst_w, + kernel[y_q4 & 0xf], 16 * src_h / dst_h, + 16 / factor, 16 / factor); + } + } + } + + vp9_extend_frame_borders(dst); +} + +// Function to test for conditions that indicate we should loop +// back and recode a frame. +static int recode_loop_test(const VP9_COMP *cpi, + int high_limit, int low_limit, + int q, int maxq, int minq) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + int force_recode = 0; + + // Special case trap if maximum allowed frame size exceeded. + if (rc->projected_frame_size > rc->max_frame_bandwidth) { + force_recode = 1; + + // Is frame recode allowed. + // Yes if either recode mode 1 is selected or mode 2 is selected + // and the frame is a key frame, golden frame or alt_ref_frame + } else if ((cpi->sf.recode_loop == ALLOW_RECODE) || + ((cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) && + (cm->frame_type == KEY_FRAME || + cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { + // General over and under shoot tests + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_mode == VPX_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + VP9_COMMON * const cm = &cpi->common; + + // At this point the new frame has been encoded. + // If any buffer copy / swapping is signaled it should be done here. + if (cm->frame_type == KEY_FRAME) { + ref_cnt_fb(cm->frame_bufs, + &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); + ref_cnt_fb(cm->frame_bufs, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + } else if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term in function + // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if + // we're updating the GF with the current decoded frame, we save it to the + // ARF slot instead. + // We now have to update the ARF with the current frame and swap gld_fb_idx + // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF + // slot and, if we're updating the GF, the current frame becomes the new GF. 
+ int tmp; + + ref_cnt_fb(cm->frame_bufs, + &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); + + tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + + if (is_two_pass_svc(cpi)) { + cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; + cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; + } + } else { /* For non key/golden frames */ + if (cpi->refresh_alt_ref_frame) { + int arf_idx = cpi->alt_fb_idx; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; + } + + ref_cnt_fb(cm->frame_bufs, + &cm->ref_frame_map[arf_idx], cm->new_fb_idx); + vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } + + if (cpi->refresh_golden_frame) { + ref_cnt_fb(cm->frame_bufs, + &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); + if (!cpi->rc.is_src_frame_alt_ref) + vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + else + vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], + cpi->interp_filter_selected[ALTREF_FRAME], + sizeof(cpi->interp_filter_selected[ALTREF_FRAME])); + } + } + + if (cpi->refresh_last_frame) { + ref_cnt_fb(cm->frame_bufs, + &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); + if (!cpi->rc.is_src_frame_alt_ref) + vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME], + cpi->interp_filter_selected[0], + sizeof(cpi->interp_filter_selected[0])); + } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_update_frame_info(&cpi->denoiser, + *cpi->Source, + cpi->common.frame_type, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame); + } +#endif +} + +static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { + MACROBLOCKD *xd = &cpi->mb.e_mbd; + struct loopfilter *lf = &cm->lf; + if (xd->lossless) { + lf->filter_level = 0; + } else { + struct vpx_usec_timer timer; + + vp9_clear_system_state(); + + vpx_usec_timer_start(&timer); + + vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick); + + vpx_usec_timer_mark(&timer); + cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); + } + + if (lf->filter_level > 0) { + vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + } + + vp9_extend_frame_inner_borders(cm->frame_to_show); +} + +void vp9_scale_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG}; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; + const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf; + + // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1). 
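+    // (ref_mask[] is indexed 0..2 while LAST_FRAME..ALTREF_FRAME run 1..3,
+    // hence the "- 1" below.)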
+ if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) && + (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) { + const int new_fb = get_free_fb(cm); + vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf, + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf); + cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + } else { + cpi->scaled_ref_idx[ref_frame - 1] = idx; + cm->frame_bufs[idx].ref_count++; + } + } +} + +static void release_scaled_references(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int i; + + for (i = 0; i < 3; i++) + cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--; +} + +static void full_to_model_count(unsigned int *model_count, + unsigned int *full_count) { + int n; + model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; + model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; + model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; + for (n = THREE_TOKEN; n < EOB_TOKEN; ++n) + model_count[TWO_TOKEN] += full_count[n]; + model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN]; +} + +static void full_to_model_counts(vp9_coeff_count_model *model_count, + vp9_coeff_count *full_count) { + int i, j, k, l; + + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) + full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]); +} + +#if 0 && CONFIG_INTERNAL_STATS +static void output_frame_level_debug_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w"); + int recon_err; + + vp9_clear_system_state(); + + recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + + if (cpi->twopass.total_left_stats.coded_error != 0.0) + fprintf(f, "%10u %10d %10d %10d %10d" + "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d " + "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" + "%6d %6d %5d %5d %5d " + "%10"PRId64" %10.3lf" + "%10lf %8u %10d %10d %10d\n", + cpi->common.current_video_frame, cpi->rc.this_frame_target, + cpi->rc.projected_frame_size, + cpi->rc.projected_frame_size / cpi->common.MBs, + (cpi->rc.projected_frame_size - cpi->rc.this_frame_target), + cpi->rc.vbr_bits_off_target, + cpi->rc.total_target_vs_actual, + (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target), + cpi->rc.total_actual_bits, cm->base_qindex, + vp9_convert_qindex_to_q(cm->base_qindex), + (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, + vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality), + cpi->rc.avg_q, + vp9_convert_qindex_to_q(cpi->oxcf.cq_level), + cpi->refresh_last_frame, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, + cpi->twopass.bits_left, + cpi->twopass.total_left_stats.coded_error, + cpi->twopass.bits_left / + (1 + cpi->twopass.total_left_stats.coded_error), + cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost, + cpi->twopass.kf_zeromotion_pct); + + fclose(f); + + if (0) { + FILE *const fmodes = fopen("Modes.stt", "a"); + int i; + + fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, + cm->frame_type, cpi->refresh_golden_frame, + cpi->refresh_alt_ref_frame); + + for (i = 0; i < MAX_MODES; ++i) + fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); + + fprintf(fmodes, "\n"); + + fclose(fmodes); + } +} +#endif + +static void encode_without_recode_loop(VP9_COMP *cpi, + int q) { + 
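  // Fast path: the quantizer is chosen once and the frame is encoded a
+  // single time. Contrast with encode_with_recode_loop() below, which
+  // re-encodes until the projected frame size meets the rate-control
+  // bounds.
+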
VP9_COMMON *const cm = &cpi->common; + vp9_clear_system_state(); + vp9_set_quantizer(cm, q); + setup_frame(cpi); + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + vp9_setup_in_frame_q_adj(cpi); + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_setup(cpi); + } + // transform / motion compensation build reconstruction frame + vp9_encode_frame(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + vp9_clear_system_state(); +} + +static void encode_with_recode_loop(VP9_COMP *cpi, + size_t *size, + uint8_t *dest, + int q, + int bottom_index, + int top_index) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int loop_count = 0; + int loop = 0; + int overshoot_seen = 0; + int undershoot_seen = 0; + int q_low = bottom_index, q_high = top_index; + int frame_over_shoot_limit; + int frame_under_shoot_limit; + + // Decide frame size bounds + vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + + do { + vp9_clear_system_state(); + + vp9_set_quantizer(cm, q); + + if (loop_count == 0) + setup_frame(cpi); + + // Variance adaptive and in frame q adjustment experiments are mutually + // exclusive. + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + vp9_setup_in_frame_q_adj(cpi); + } + + // transform / motion compensation build reconstruction frame + vp9_encode_frame(cpi); + + // Update the skip mb flag probabilities based on the distribution + // seen in the last encoder iteration. + // update_base_skip_probs(cpi); + + vp9_clear_system_state(); + + // Dummy pack of the bitstream using up to date stats to get an + // accurate estimate of output frame size to determine if we need + // to recode. + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { + save_coding_context(cpi); + if (!cpi->sf.use_nonrd_pick_mode) + vp9_pack_bitstream(cpi, dest, size); + + rc->projected_frame_size = (int)(*size) << 3; + restore_coding_context(cpi); + + if (frame_over_shoot_limit == 0) + frame_over_shoot_limit = 1; + } + + if (cpi->oxcf.rc_mode == VPX_Q) { + loop = 0; + } else { + if ((cm->frame_type == KEY_FRAME) && + rc->this_key_frame_forced && + (rc->projected_frame_size < rc->max_frame_bandwidth)) { + int last_q = q; + int kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + + int high_err_target = cpi->ambient_err; + int low_err_target = cpi->ambient_err >> 1; + + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + q_high = q > q_low ? q - 1 : q_low; + + // Adjust Q + q = (q * high_err_target) / kf_err; + q = MIN(q, (q_high + q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + q_low = q < q_high ? 
q + 1 : q_high; + + // Adjust Q + q = (q * low_err_target) / kf_err; + q = MIN(q, (q_high + q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = q != last_q; + } else if (recode_loop_test( + cpi, frame_over_shoot_limit, frame_under_shoot_limit, + q, MAX(q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + int last_q = q; + int retries = 0; + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. + if (rc->projected_frame_size >= rc->max_frame_bandwidth) + q_high = rc->worst_quality; + + // Raise Qlow as to at least the current value + q_low = q < q_high ? q + 1 : q_high; + + if (undershoot_seen || loop_count > 1) { + // Update rate_correction_factor unless + vp9_rc_update_rate_correction_factors(cpi, 1); + + q = (q_high + q_low + 1) / 2; + } else { + // Update rate_correction_factor unless + vp9_rc_update_rate_correction_factors(cpi, 0); + + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + bottom_index, MAX(q_high, top_index)); + + while (q < q_low && retries < 10) { + vp9_rc_update_rate_correction_factors(cpi, 0); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + bottom_index, MAX(q_high, top_index)); + retries++; + } + } + + overshoot_seen = 1; + } else { + // Frame is too small + q_high = q > q_low ? q - 1 : q_low; + + if (overshoot_seen || loop_count > 1) { + vp9_rc_update_rate_correction_factors(cpi, 1); + q = (q_high + q_low) / 2; + } else { + vp9_rc_update_rate_correction_factors(cpi, 0); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + bottom_index, top_index); + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passsed in value. + if (cpi->oxcf.rc_mode == VPX_CQ && + q < q_low) { + q_low = q; + } + + while (q > q_high && retries < 10) { + vp9_rc_update_rate_correction_factors(cpi, 0); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + bottom_index, top_index); + retries++; + } + } + + undershoot_seen = 1; + } + + // Clamp Q to upper and lower limits: + q = clamp(q, q_low, q_high); + + loop = q != last_q; + } else { + loop = 0; + } + } + + // Special case for overlay frame. 
+ if (rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) + loop = 0; + + if (loop) { + loop_count++; + +#if CONFIG_INTERNAL_STATS + cpi->tot_recode_hits++; +#endif + } + } while (loop); +} + +static int get_ref_frame_flags(const VP9_COMP *cpi) { + const int *const map = cpi->common.ref_frame_map; + const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx]; + const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx]; + const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx]; + int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + if (gold_is_last) + flags &= ~VP9_GOLD_FLAG; + + if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi)) + flags &= ~VP9_GOLD_FLAG; + + if (alt_is_last) + flags &= ~VP9_ALT_FLAG; + + if (gold_is_alt) + flags &= ~VP9_ALT_FLAG; + + return flags; +} + +static void set_ext_overrides(VP9_COMP *cpi) { + // Overrides the defaults with the externally supplied values with + // vp9_update_reference() and vp9_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to encode_frame_to_data_rate() function + if (cpi->ext_refresh_frame_context_pending) { + cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context; + cpi->ext_refresh_frame_context_pending = 0; + } + if (cpi->ext_refresh_frame_flags_pending) { + cpi->refresh_last_frame = cpi->ext_refresh_last_frame; + cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame; + cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; + cpi->ext_refresh_frame_flags_pending = 0; + } +} + +YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled) { + if (cm->mi_cols * MI_SIZE != unscaled->y_width || + cm->mi_rows * MI_SIZE != unscaled->y_height) { + scale_and_extend_frame_nonnormative(unscaled, scaled); + return scaled; + } else { + return unscaled; + } +} + +static int is_skippable_frame(const VP9_COMP *cpi) { + // If the current frame does not have non-zero motion vector detected in the + // first pass, and so do its previous and forward frames, then this frame + // can be skipped for partition check, and the partition size is assigned + // according to the variance + const SVC *const svc = &cpi->svc; + const TWO_PASS *const twopass = is_two_pass_svc(cpi) ? 
+ &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; + + return (!frame_is_intra_only(&cpi->common) && + twopass->stats_in - 2 > twopass->stats_in_start && + twopass->stats_in < twopass->stats_in_end && + (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion + == 1 && + (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion + == 1 && + twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); +} + +static void set_arf_sign_bias(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + int arf_sign_bias; + + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_sign_bias = cpi->rc.source_alt_ref_active && + (!cpi->refresh_alt_ref_frame || + (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); + } else { + arf_sign_bias = + (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + } + cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; +} + +static void set_mv_search_params(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const unsigned int max_mv_def = MIN(cm->width, cm->height); + + // Default based on max resolution. + cpi->mv_step_param = vp9_init_search_range(max_mv_def); + + if (cpi->sf.mv.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + cpi->mv_step_param = + vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->max_mv_magnitude = 0; + } + } +} + + +int setup_interp_filter_search_mask(VP9_COMP *cpi) { + INTERP_FILTER ifilter; + int ref_total[MAX_REF_FRAMES] = {0}; + MV_REFERENCE_FRAME ref; + int mask = 0; + if (cpi->common.last_frame_type == KEY_FRAME || + cpi->refresh_alt_ref_frame) + return mask; + for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) + for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) + ref_total[ref] += cpi->interp_filter_selected[ref][ifilter]; + + for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) { + if ((ref_total[LAST_FRAME] && + cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) && + (ref_total[GOLDEN_FRAME] == 0 || + cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 + < ref_total[GOLDEN_FRAME]) && + (ref_total[ALTREF_FRAME] == 0 || + cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50 + < ref_total[ALTREF_FRAME])) + mask |= 1 << ifilter; + } + return mask; +} + +static void encode_frame_to_data_rate(VP9_COMP *cpi, + size_t *size, + uint8_t *dest, + unsigned int *frame_flags) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + TX_SIZE t; + int q; + int top_index; + int bottom_index; + + set_ext_overrides(cpi); + + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source); + + if (cpi->unscaled_last_source != NULL) + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + + vp9_scale_references(cpi); + + vp9_clear_system_state(); + + // Enable or disable mode based tweaking of the zbin. + // For 2 pass only used where GF/ARF prediction quality + // is above a threshold. 
+ cpi->zbin_mode_boost = 0; + cpi->zbin_mode_boost_enabled = 0; + + // Set the arf sign bias for this frame. + set_arf_sign_bias(cpi); + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + set_mv_search_params(cpi); + + if (cpi->oxcf.pass == 2 && + cpi->sf.adaptive_interp_filter_search) + cpi->sf.interp_filter_search_mask = + setup_interp_filter_search_mask(cpi); + + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm)) { + // Reset the loop filter deltas and segmentation map. + vp9_reset_segment_features(&cm->seg); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + + // The alternate reference frame cannot be active for a key frame. + cpi->rc.source_alt_ref_active = 0; + + cm->error_resilient_mode = oxcf->error_resilient_mode; + + // By default, encoder assumes decoder can use prev_mi. + if (cm->error_resilient_mode) { + cm->frame_parallel_decoding_mode = 1; + cm->reset_frame_context = 0; + cm->refresh_frame_context = 0; + } else if (cm->intra_only) { + cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; + // Only reset the current context. + cm->reset_frame_context = 2; + } + } + if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { + cm->frame_context_idx = + cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id; + + // The probs will be updated based on the frame type of its previous + // frame if frame_parallel_decoding_mode is 0. The type may vary for + // the frame after a key frame in base layer since we may drop enhancement + // layers. So set frame_parallel_decoding_mode to 1 in this case. + if (cpi->svc.number_temporal_layers == 1) { + if (cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) + cm->frame_parallel_decoding_mode = 1; + else + cm->frame_parallel_decoding_mode = 0; + } else if (cpi->svc.spatial_layer_id == 0) { + // Find the 2nd frame in temporal base layer and 1st frame in temporal + // enhancement layers from the key frame. + int i; + for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { + if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { + cm->frame_parallel_decoding_mode = 1; + break; + } + } + if (i == cpi->svc.number_temporal_layers) + cm->frame_parallel_decoding_mode = 0; + } + } + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. + // Only allowed in second pass of two pass (as requires lagged coding) + // and if the relevant speed feature flag is set. + if (oxcf->pass == 2 && cpi->sf.static_segmentation) + configure_static_seg_features(cpi); + + // Check if the current frame is skippable for the partition search in the + // second pass according to the first pass stats + if (oxcf->pass == 2 && + (!cpi->use_svc || is_two_pass_svc(cpi))) { + cpi->skippable_frame = is_skippable_frame(cpi); + } + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame. 
+ if (oxcf->pass == 0 && + oxcf->rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME) { + if (vp9_rc_drop_frame(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + ++cm->current_video_frame; + return; + } + } + + vp9_clear_system_state(); + +#if CONFIG_VP9_POSTPROC + if (oxcf->noise_sensitivity > 0) { + int l = 0; + switch (oxcf->noise_sensitivity) { + case 1: + l = 20; + break; + case 2: + l = 40; + break; + case 3: + l = 60; + break; + case 4: + case 5: + l = 100; + break; + case 6: + l = 150; + break; + } + vp9_denoise(cpi->Source, cpi->Source, l); + } +#endif + +#if CONFIG_INTERNAL_STATS + int i; + for (i = 0; i < MAX_MODES; ++i) + cpi->mode_chosen_counts[i] = 0; +#endif + + vp9_set_speed_features(cpi); + + vp9_set_rd_speed_thresholds(cpi); + vp9_set_rd_speed_thresholds_sub8x8(cpi); + + // Decide q and q bounds. + q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); + + if (!frame_is_intra_only(cm)) { + cm->interp_filter = cpi->sf.default_interp_filter; + /* TODO: Decide this more intelligently */ + vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH); + } + + if (cpi->sf.recode_loop == DISALLOW_RECODE) { + encode_without_recode_loop(cpi, q); + } else { + encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); + } + +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + if (oxcf->noise_sensitivity > 0) { + vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], + yuv_denoised_file); + } +#endif +#endif + + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. Note the reconstruction error if it is the frame before + // the force key frame + if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { + cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm)); + } + + // If the encoder forced a KEY_FRAME decision + if (cm->frame_type == KEY_FRAME) + cpi->refresh_last_frame = 1; + + cm->frame_to_show = get_frame_new_buffer(cm); + + // Pick the loop filter level for the frame. + loopfilter_frame(cpi, cm); + + // build the bitstream + vp9_pack_bitstream(cpi, dest, size); + + if (cm->seg.update_map) + update_reference_segmentation_map(cpi); + + release_scaled_references(cpi); + vp9_update_reference_frames(cpi); + + for (t = TX_4X4; t <= TX_32X32; t++) + full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]); + + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) + vp9_adapt_coef_probs(cm); + + if (!frame_is_intra_only(cm)) { + if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + } + + if (cpi->refresh_golden_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_GOLDEN; + else + cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; + + if (cpi->refresh_alt_ref_frame == 1) + cpi->frame_flags |= FRAMEFLAGS_ALTREF; + else + cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; + + cpi->ref_frame_flags = get_ref_frame_flags(cpi); + + cm->last_frame_type = cm->frame_type; + vp9_rc_postencode_update(cpi, *size); + +#if 0 + output_frame_level_debug_stats(cpi); +#endif + + if (cm->frame_type == KEY_FRAME) { + // Tell the caller that the frame was coded as a key frame + *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY; + } else { + *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; + } + + // Clear the one shot update flags for segmentation map and mode/ref loop + // filter deltas. 
+ cm->seg.update_map = 0; + cm->seg.update_data = 0; + cm->lf.mode_ref_delta_update = 0; + + // keep track of the last coded dimensions + cm->last_width = cm->width; + cm->last_height = cm->height; + + // reset to normal state now that we are done. + if (!cm->show_existing_frame) { + if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) + cm->last_show_frame = 0; + else + cm->last_show_frame = cm->show_frame; + } + + if (cm->show_frame) { + vp9_swap_mi_and_prev_mi(cm); + + // Don't increment frame counters if this was an altref buffer + // update not a real frame + ++cm->current_video_frame; + if (cpi->use_svc) + vp9_inc_frame_in_layer(cpi); + } + + if (is_two_pass_svc(cpi)) + cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type = + cm->frame_type; +} + +static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags) { + vp9_rc_get_svc_params(cpi); + encode_frame_to_data_rate(cpi, size, dest, frame_flags); +} + +static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, + unsigned int *frame_flags) { + if (cpi->oxcf.rc_mode == VPX_CBR) { + vp9_rc_get_one_pass_cbr_params(cpi); + } else { + vp9_rc_get_one_pass_vbr_params(cpi); + } + encode_frame_to_data_rate(cpi, size, dest, frame_flags); +} + +static void Pass2Encode(VP9_COMP *cpi, size_t *size, + uint8_t *dest, unsigned int *frame_flags) { + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + + vp9_rc_get_second_pass_params(cpi); + encode_frame_to_data_rate(cpi, size, dest, frame_flags); + + vp9_twopass_postencode_update(cpi); +} + +static void init_motion_estimation(VP9_COMP *cpi) { + int y_stride = cpi->scaled_source.y_stride; + + if (cpi->sf.mv.search_method == NSTEP) { + vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride); + } else if (cpi->sf.mv.search_method == DIAMOND) { + vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); + } +} + +static void check_initial_width(VP9_COMP *cpi, int subsampling_x, + int subsampling_y) { + VP9_COMMON *const cm = &cpi->common; + + if (!cpi->initial_width) { + cm->subsampling_x = subsampling_x; + cm->subsampling_y = subsampling_y; + + alloc_raw_frame_buffers(cpi); + alloc_ref_frame_buffers(cpi); + alloc_util_frame_buffers(cpi); + + init_motion_estimation(cpi); + + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; + } +} + + +int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + VP9_COMMON *cm = &cpi->common; + struct vpx_usec_timer timer; + int res = 0; + const int subsampling_x = sd->uv_width < sd->y_width; + const int subsampling_y = sd->uv_height < sd->y_height; + + check_initial_width(cpi, subsampling_x, subsampling_y); + + vpx_usec_timer_start(&timer); + +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi)) + res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time, + frame_flags); + else +#endif + res = vp9_lookahead_push(cpi->lookahead, + sd, time_stamp, end_time, frame_flags); + if (res) + res = -1; + vpx_usec_timer_mark(&timer); + cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); + + if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) && + (subsampling_x != 1 || subsampling_y != 1)) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "Non-4:2:0 color space requires profile 1 or 3"); + res = -1; + } + if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) && + (subsampling_x == 1 && subsampling_y == 1)) { + vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM, + "4:2:0 color space 
requires profile 0 or 2"); + res = -1; + } + + return res; +} + + +static int frame_is_reference(const VP9_COMP *cpi) { + const VP9_COMMON *cm = &cpi->common; + + return cm->frame_type == KEY_FRAME || + cpi->refresh_last_frame || + cpi->refresh_golden_frame || + cpi->refresh_alt_ref_frame || + cm->refresh_frame_context || + cm->lf.mode_ref_delta_update || + cm->seg.update_map || + cm->seg.update_data; +} + +void adjust_frame_rate(VP9_COMP *cpi, + const struct lookahead_entry *source) { + int64_t this_duration; + int step = 0; + + if (source->ts_start == cpi->first_time_stamp_ever) { + this_duration = source->ts_end - source->ts_start; + step = 1; + } else { + int64_t last_duration = cpi->last_end_time_stamp_seen + - cpi->last_time_stamp_seen; + + this_duration = source->ts_end - cpi->last_end_time_stamp_seen; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + vp9_new_framerate(cpi, 10000000.0 / this_duration); + } else { + // Average this frame's rate into the last second's average + // frame rate. If we haven't seen 1 second yet, then average + // over the whole interval seen. + const double interval = MIN((double)(source->ts_end + - cpi->first_time_stamp_ever), 10000000.0); + double avg_duration = 10000000.0 / cpi->framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + + vp9_new_framerate(cpi, 10000000.0 / avg_duration); + } + } + cpi->last_time_stamp_seen = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_end; +} + +// Returns 0 if this is not an alt ref else the offset of the source frame +// used as the arf midpoint. +static int get_arf_src_index(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int arf_src_index = 0; + if (is_altref_enabled(cpi)) { + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + arf_src_index = gf_group->arf_src_offset[gf_group->index]; + } + } else if (rc->source_alt_ref_pending) { + arf_src_index = rc->frames_till_gf_update_due; + } + } + return arf_src_index; +} + +static void check_src_altref(VP9_COMP *cpi, + const struct lookahead_entry *source) { + RATE_CONTROL *const rc = &cpi->rc; + + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + rc->is_src_frame_alt_ref = + (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); + } else { + rc->is_src_frame_alt_ref = cpi->alt_ref_source && + (source == cpi->alt_ref_source); + } + + if (rc->is_src_frame_alt_ref) { + // Current frame is an ARF overlay frame. + cpi->alt_ref_source = NULL; + + // Don't refresh the last buffer for an ARF overlay frame. It will + // become the GF so preserve last as an alternative prediction option. 
+ cpi->refresh_last_frame = 0; + } +} + +int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, + int64_t *time_stamp, int64_t *time_end, int flush) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + RATE_CONTROL *const rc = &cpi->rc; + struct vpx_usec_timer cmptimer; + YV12_BUFFER_CONFIG *force_src_buffer = NULL; + struct lookahead_entry *last_source = NULL; + struct lookahead_entry *source = NULL; + MV_REFERENCE_FRAME ref_frame; + int arf_src_index; + + if (is_two_pass_svc(cpi) && oxcf->pass == 2) { +#if CONFIG_SPATIAL_SVC + vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1); +#endif + vp9_restore_layer_context(cpi); + } + + vpx_usec_timer_start(&cmptimer); + + vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV); + + // Normal defaults + cm->reset_frame_context = 0; + cm->refresh_frame_context = 1; + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + + // Should we encode an arf frame. + arf_src_index = get_arf_src_index(cpi); + if (arf_src_index) { + assert(arf_src_index <= rc->frames_to_key); + +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi)) + source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0); + else +#endif + source = vp9_lookahead_peek(cpi->lookahead, arf_src_index); + if (source != NULL) { + cpi->alt_ref_source = source; + +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) { + int i; + // Reference a hidden frame from a lower layer + for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { + if (oxcf->ss_play_alternate[i]) { + cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx; + break; + } + } + } + cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; +#endif + + if (oxcf->arnr_max_frames > 0) { + // Produce the filtered ARF frame. + vp9_temporal_filter(cpi, arf_src_index); + vp9_extend_frame_borders(&cpi->alt_ref_buffer); + force_src_buffer = &cpi->alt_ref_buffer; + } + + cm->show_frame = 0; + cpi->refresh_alt_ref_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_last_frame = 0; + rc->is_src_frame_alt_ref = 0; + rc->source_alt_ref_pending = 0; + } else { + rc->source_alt_ref_pending = 0; + } + } + + if (!source) { + // Get last frame source. + if (cm->current_video_frame > 0) { +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi)) + last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); + else +#endif + last_source = vp9_lookahead_peek(cpi->lookahead, -1); + if (last_source == NULL) + return -1; + } + + // Read in the source frame. +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi)) + source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); + else +#endif + source = vp9_lookahead_pop(cpi->lookahead, flush); + if (source != NULL) { + cm->show_frame = 1; + cm->intra_only = 0; + + // Check to see if the frame should be encoded as an arf overlay. + check_src_altref(cpi, source); + } + } + + if (source) { + cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer + : &source->img; + + cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; + + } else { + *size = 0; + if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + vp9_end_first_pass(cpi); /* get last stats packet */ + cpi->twopass.first_pass_done = 1; + } + return -1; + } + + if (source->ts_start < cpi->first_time_stamp_ever) { + cpi->first_time_stamp_ever = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_start; + } + + // Clear down mmx registers + vp9_clear_system_state(); + + // adjust frame rates based on timestamps given + if (cm->show_frame) { + adjust_frame_rate(cpi, source); + } + + if (cpi->svc.number_temporal_layers > 1 && + oxcf->rc_mode == VPX_CBR) { + vp9_update_temporal_layer_framerate(cpi); + vp9_restore_layer_context(cpi); + } + + // start with a 0 size frame + *size = 0; + + /* find a free buffer for the new frame, releasing the reference previously + * held. + */ + cm->frame_bufs[cm->new_fb_idx].ref_count--; + cm->new_fb_idx = get_free_fb(cm); + + if (!cpi->use_svc && cpi->multi_arf_allowed) { + if (cm->frame_type == KEY_FRAME) { + init_buffer_indices(cpi); + } else if (oxcf->pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; + } + } + + cpi->frame_flags = *frame_flags; + + if (oxcf->pass == 2 && + cm->current_video_frame == 0 && + oxcf->allow_spatial_resampling && + oxcf->rc_mode == VPX_VBR) { + // Internal scaling is triggered on the first frame. + vp9_set_size_literal(cpi, oxcf->scaled_frame_width, + oxcf->scaled_frame_height); + } + + // Reset the frame pointers to the current frame size + vp9_realloc_frame_buffer(get_frame_new_buffer(cm), + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); + + alloc_util_frame_buffers(cpi); + init_motion_estimation(cpi); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; + YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf; + RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; + ref_buf->buf = buf; + ref_buf->idx = idx; + vp9_setup_scale_factors_for_frame(&ref_buf->sf, + buf->y_crop_width, buf->y_crop_height, + cm->width, cm->height); + + if (vp9_is_scaled(&ref_buf->sf)) + vp9_extend_frame_borders(buf); + } + + set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); + + if (oxcf->aq_mode == VARIANCE_AQ) { + vp9_vaq_init(); + } + + if (oxcf->pass == 1 && + (!cpi->use_svc || is_two_pass_svc(cpi))) { + const int lossless = is_lossless_requested(oxcf); + cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4; + cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + vp9_first_pass(cpi, source); + } else if (oxcf->pass == 2 && + (!cpi->use_svc || is_two_pass_svc(cpi))) { + Pass2Encode(cpi, size, dest, frame_flags); + } else if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } + + if (cm->refresh_frame_context) + cm->frame_contexts[cm->frame_context_idx] = cm->fc; + + // Frame was dropped, release scaled references. + if (*size == 0) { + release_scaled_references(cpi); + } + + if (*size > 0) { + cpi->droppable = !frame_is_reference(cpi); + } + + // Save layer specific state. 
+ if ((cpi->svc.number_temporal_layers > 1 && + oxcf->rc_mode == VPX_CBR) || + ((cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + oxcf->pass == 2)) { + vp9_save_layer_context(cpi); + } + + vpx_usec_timer_mark(&cmptimer); + cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); + + if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame) + generate_psnr_packet(cpi); + +#if CONFIG_INTERNAL_STATS + + if (oxcf->pass != 1) { + cpi->bytes += (int)(*size); + + if (cm->show_frame) { + cpi->count++; + + if (cpi->b_calculate_psnr) { + YV12_BUFFER_CONFIG *orig = cpi->Source; + YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; + PSNR_STATS psnr; + calc_psnr(orig, recon, &psnr); + + cpi->total += psnr.psnr[0]; + cpi->total_y += psnr.psnr[1]; + cpi->total_u += psnr.psnr[2]; + cpi->total_v += psnr.psnr[3]; + cpi->total_sq_error += psnr.sse[0]; + cpi->total_samples += psnr.samples[0]; + + { + PSNR_STATS psnr2; + double frame_ssim2 = 0, weight = 0; +#if CONFIG_VP9_POSTPROC + // TODO(agrange) Add resizing of post-proc buffer in here when the + // encoder is changed to use on-demand buffer allocation. + vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, + cm->lf.filter_level * 10 / 6); +#endif + vp9_clear_system_state(); + + calc_psnr(orig, pp, &psnr2); + + cpi->totalp += psnr2.psnr[0]; + cpi->totalp_y += psnr2.psnr[1]; + cpi->totalp_u += psnr2.psnr[2]; + cpi->totalp_v += psnr2.psnr[3]; + cpi->totalp_sq_error += psnr2.sse[0]; + cpi->totalp_samples += psnr2.samples[0]; + + frame_ssim2 = vp9_calc_ssim(orig, recon, &weight); + + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + + frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight); + + cpi->summedp_quality += frame_ssim2 * weight; + cpi->summedp_weights += weight; +#if 0 + { + FILE *f = fopen("q_used.stt", "a"); + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cpi->common.current_video_frame, y2, u2, v2, + frame_psnr2, frame_ssim2); + fclose(f); + } +#endif + } + } + + + if (cpi->b_calculate_ssimg) { + double y, u, v, frame_all; + frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v); + cpi->total_ssimg_y += y; + cpi->total_ssimg_u += u; + cpi->total_ssimg_v += v; + cpi->total_ssimg_all += frame_all; + } + } + } + +#endif + return 0; +} + +int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags) { + VP9_COMMON *cm = &cpi->common; +#if !CONFIG_VP9_POSTPROC + (void)flags; +#endif + + if (!cm->show_frame) { + return -1; + } else { + int ret; +#if CONFIG_VP9_POSTPROC + ret = vp9_post_proc_frame(cm, dest, flags); +#else + if (cm->frame_to_show) { + *dest = *cm->frame_to_show; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->subsampling_x; + dest->uv_height = cm->height >> cm->subsampling_y; + ret = 0; + } else { + ret = -1; + } +#endif // !CONFIG_VP9_POSTPROC + vp9_clear_system_state(); + return ret; + } +} + +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) { + if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { + const int mi_rows = cpi->common.mi_rows; + const int mi_cols = cpi->common.mi_cols; + if (map) { + int r, c; + for (r = 0; r < mi_rows; r++) { + for (c = 0; c < mi_cols; c++) { + cpi->segmentation_map[r * mi_cols + c] = + !map[(r >> 1) * cols + (c >> 1)]; + } + } + vp9_enable_segfeature(&cpi->common.seg, 1, SEG_LVL_SKIP); + 
vp9_enable_segmentation(&cpi->common.seg); + } else { + vp9_disable_segmentation(&cpi->common.seg); + } + return 0; + } else { + return -1; + } +} + +int vp9_set_internal_size(VP9_COMP *cpi, + VPX_SCALING horiz_mode, VPX_SCALING vert_mode) { + VP9_COMMON *cm = &cpi->common; + int hr = 0, hs = 0, vr = 0, vs = 0; + + if (horiz_mode > ONETWO || vert_mode > ONETWO) + return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs; + cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs; + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + + update_frame_size(cpi); + + return 0; +} + +int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, + unsigned int height) { + VP9_COMMON *cm = &cpi->common; + + check_initial_width(cpi, 1, 1); + + if (width) { + cm->width = width; + if (cm->width * 5 < cpi->initial_width) { + cm->width = cpi->initial_width / 5 + 1; + printf("Warning: Desired width too small, changed to %d\n", cm->width); + } + if (cm->width > cpi->initial_width) { + cm->width = cpi->initial_width; + printf("Warning: Desired width too large, changed to %d\n", cm->width); + } + } + + if (height) { + cm->height = height; + if (cm->height * 5 < cpi->initial_height) { + cm->height = cpi->initial_height / 5 + 1; + printf("Warning: Desired height too small, changed to %d\n", cm->height); + } + if (cm->height > cpi->initial_height) { + cm->height = cpi->initial_height; + printf("Warning: Desired height too large, changed to %d\n", cm->height); + } + } + assert(cm->width <= cpi->initial_width); + assert(cm->height <= cpi->initial_height); + + update_frame_size(cpi); + + return 0; +} + +void vp9_set_svc(VP9_COMP *cpi, int use_svc) { + cpi->use_svc = use_svc; + return; +} + +int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + + +int vp9_get_quantizer(VP9_COMP *cpi) { + return cpi->common.base_qindex; +} + +void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { + if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | + VP8_EFLAG_NO_REF_ARF)) { + int ref = 7; + + if (flags & VP8_EFLAG_NO_REF_LAST) + ref ^= VP9_LAST_FLAG; + + if (flags & VP8_EFLAG_NO_REF_GF) + ref ^= VP9_GOLD_FLAG; + + if (flags & VP8_EFLAG_NO_REF_ARF) + ref ^= VP9_ALT_FLAG; + + vp9_use_as_reference(cpi, ref); + } + + if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF | + VP8_EFLAG_FORCE_ARF)) { + int upd = 7; + + if (flags & VP8_EFLAG_NO_UPD_LAST) + upd ^= VP9_LAST_FLAG; + + if (flags & VP8_EFLAG_NO_UPD_GF) + upd ^= VP9_GOLD_FLAG; + + if (flags & VP8_EFLAG_NO_UPD_ARF) + upd ^= VP9_ALT_FLAG; + + vp9_update_reference(cpi, upd); + } + + if (flags & VP8_EFLAG_NO_UPD_ENTROPY) { + vp9_update_entropy(cpi, 0); + } +} diff --git a/media/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/vp9/encoder/vp9_encoder.h new file mode 100644 index 000000000000..0d3c4c19a4d8 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_encoder.h @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_ENCODER_H_ +#define VP9_ENCODER_VP9_ENCODER_H_ + +#include + +#include "./vpx_config.h" +#include "vpx_ports/mem.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8cx.h" + +#include "vp9/common/vp9_ppflags.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_onyxc_int.h" + +#include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#include "vp9/encoder/vp9_context_tree.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_speed_features.h" +#include "vp9/encoder/vp9_svc_layercontext.h" +#include "vp9/encoder/vp9_tokenize.h" +#include "vp9/encoder/vp9_variance.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +#include "vp9/encoder/vp9_denoiser.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define DEFAULT_GF_INTERVAL 10 + +typedef struct { + int nmvjointcost[MV_JOINTS]; + int nmvcosts[2][MV_VALS]; + int nmvcosts_hp[2][MV_VALS]; + + vp9_prob segment_pred_probs[PREDICTION_PROBS]; + + unsigned char *last_frame_seg_map_copy; + + // 0 = Intra, Last, GF, ARF + signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; + // 0 = ZERO_MV, MV + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + + FRAME_CONTEXT fc; +} CODING_CONTEXT; + + +typedef enum { + // encode_breakout is disabled. + ENCODE_BREAKOUT_DISABLED = 0, + // encode_breakout is enabled. + ENCODE_BREAKOUT_ENABLED = 1, + // encode_breakout is enabled with small max_thresh limit. + ENCODE_BREAKOUT_LIMITED = 2 +} ENCODE_BREAKOUT_TYPE; + +typedef enum { + NORMAL = 0, + FOURFIVE = 1, + THREEFIVE = 2, + ONETWO = 3 +} VPX_SCALING; + +typedef enum { + // Good Quality Fast Encoding. The encoder balances quality with the amount of + // time it takes to encode the output. Speed setting controls how fast. + GOOD, + + // The encoder places priority on the quality of the output over encoding + // speed. The output is compressed at the highest possible quality. This + // option takes the longest amount of time to encode. Speed setting ignored. + BEST, + + // Realtime/Live Encoding. This mode is optimized for realtime encoding (for + // example, capturing a television signal or feed from a live camera). Speed + // setting controls how fast. + REALTIME +} MODE; + +typedef enum { + FRAMEFLAGS_KEY = 1 << 0, + FRAMEFLAGS_GOLDEN = 1 << 1, + FRAMEFLAGS_ALTREF = 1 << 2, +} FRAMETYPE_FLAGS; + +typedef enum { + NO_AQ = 0, + VARIANCE_AQ = 1, + COMPLEXITY_AQ = 2, + CYCLIC_REFRESH_AQ = 3, + AQ_MODE_COUNT // This should always be the last member of the enum +} AQ_MODE; + + +typedef struct VP9EncoderConfig { + BITSTREAM_PROFILE profile; + vpx_bit_depth_t bit_depth; // Codec bit-depth. + int width; // width of data passed to the compressor + int height; // height of data passed to the compressor + unsigned int input_bit_depth; // Input bit depth. 
+ double init_framerate; // set to passed in framerate + int64_t target_bandwidth; // bandwidth to be used in kilobits per second + + int noise_sensitivity; // pre processing blur: recommendation 0 + int sharpness; // sharpening output: recommendation 0: + int speed; + unsigned int rc_max_intra_bitrate_pct; + + MODE mode; + int pass; + + // Key Framing Operations + int auto_key; // autodetect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. + + int lag_in_frames; // how many frames lag before we start encoding + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + enum vpx_rc_mode rc_mode; + + // buffer targeting aggressiveness + int under_shoot_pct; + int over_shoot_pct; + + // buffering parameters + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; + + // Frame drop threshold. + int drop_frames_water_mark; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + AQ_MODE aq_mode; // Adaptive Quantization mode + + // Internal frame size scaling. + int allow_spatial_resampling; + int scaled_frame_width; + int scaled_frame_height; + + // Enable feature to reduce the frame quantization every x frames. + int frame_periodic_boost; + + // two pass datarate control + int two_pass_vbrbias; // two pass datarate control tweaks + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + // Spatial and temporal scalability. + int ss_number_layers; // Number of spatial layers. + int ts_number_layers; // Number of temporal layers. + // Bitrate allocation for spatial layers. + int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + int ss_play_alternate[VPX_SS_MAX_LAYERS]; + // Bitrate allocation (CBR mode) and framerate factor, for temporal layers. + int ts_target_bitrate[VPX_TS_MAX_LAYERS]; + int ts_rate_decimator[VPX_TS_MAX_LAYERS]; + + // these parameters aren't to be used in final build don't use!!! + int play_alternate; + + int encode_breakout; // early breakout : for video conf recommend 800 + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. 
+ */ + unsigned int frame_parallel_decoding_mode; + + int arnr_max_frames; + int arnr_strength; + + int tile_columns; + int tile_rows; + + vpx_fixed_buf_t two_pass_stats_in; + struct vpx_codec_pkt_list *output_pkt_list; + +#if CONFIG_FP_MB_STATS + vpx_fixed_buf_t firstpass_mb_stats_in; +#endif + + vp8e_tuning tuning; + vp9e_tune_content content; +} VP9EncoderConfig; + +static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + +typedef struct VP9_COMP { + QUANTS quants; + MACROBLOCK mb; + VP9_COMMON common; + VP9EncoderConfig oxcf; + struct lookahead_ctx *lookahead; + struct lookahead_entry *alt_ref_source; + + YV12_BUFFER_CONFIG *Source; + YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames + YV12_BUFFER_CONFIG *un_scaled_source; + YV12_BUFFER_CONFIG scaled_source; + YV12_BUFFER_CONFIG *unscaled_last_source; + YV12_BUFFER_CONFIG scaled_last_source; + + int skippable_frame; + + int scaled_ref_idx[3]; + int lst_fb_idx; + int gld_fb_idx; + int alt_fb_idx; + + int refresh_last_frame; + int refresh_golden_frame; + int refresh_alt_ref_frame; + + int ext_refresh_frame_flags_pending; + int ext_refresh_last_frame; + int ext_refresh_golden_frame; + int ext_refresh_alt_ref_frame; + + int ext_refresh_frame_context_pending; + int ext_refresh_frame_context; + + YV12_BUFFER_CONFIG last_frame_uf; + + TOKENEXTRA *tok; + unsigned int tok_count[4][1 << 6]; + + // Ambient reconstruction err target for force key frames + int ambient_err; + + RD_OPT rd; + + CODING_CONTEXT coding_context; + + int *nmvcosts[2]; + int *nmvcosts_hp[2]; + int *nmvsadcosts[2]; + int *nmvsadcosts_hp[2]; + + int zbin_mode_boost; + int zbin_mode_boost_enabled; + + int64_t last_time_stamp_seen; + int64_t last_end_time_stamp_seen; + int64_t first_time_stamp_ever; + + RATE_CONTROL rc; + double framerate; + + vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; + int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]; + + struct vpx_codec_pkt_list *output_pkt_list; + + MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; + int mbgraph_n_frames; // number of frames filled in the above + int static_mb_pct; // % forced skip mbs by segmentation + int ref_frame_flags; + + SPEED_FEATURES sf; + + unsigned int max_mv_magnitude; + int mv_step_param; + + // Default value is 1. From first pass stats, encode_breakout may be disabled. + ENCODE_BREAKOUT_TYPE allow_encode_breakout; + + // Get threshold from external input. A suggested threshold is 800 for HD + // clips, and 300 for < HD clips. 
+ int encode_breakout; + + unsigned char *segmentation_map; + + // segment threashold for encode breakout + int segment_encode_breakout[MAX_SEGMENTS]; + + unsigned char *complexity_map; + + CYCLIC_REFRESH *cyclic_refresh; + + fractional_mv_step_fp *find_fractional_mv_step; + vp9_full_search_fn_t full_search_sad; + vp9_refining_search_fn_t refining_search_sad; + vp9_diamond_search_fn_t diamond_search_sad; + vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; + uint64_t time_receive_data; + uint64_t time_compress_data; + uint64_t time_pick_lpf; + uint64_t time_encode_sb_row; + +#if CONFIG_FP_MB_STATS + int use_fp_mb_stats; +#endif + + TWO_PASS twopass; + + YV12_BUFFER_CONFIG alt_ref_buffer; + + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; + + int count; + double total_y; + double total_u; + double total_v; + double total; + uint64_t total_sq_error; + uint64_t total_samples; + + double totalp_y; + double totalp_u; + double totalp_v; + double totalp; + uint64_t totalp_sq_error; + uint64_t totalp_samples; + + int bytes; + double summed_quality; + double summed_weights; + double summedp_quality; + double summedp_weights; + unsigned int tot_recode_hits; + + + double total_ssimg_y; + double total_ssimg_u; + double total_ssimg_v; + double total_ssimg_all; + + int b_calculate_ssimg; +#endif + int b_calculate_psnr; + + int droppable; + + int initial_width; + int initial_height; + + int use_svc; + + SVC svc; + + // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. + diff *source_diff_var; + // The threshold used in SOURCE_VAR_BASED_PARTITION search type. + unsigned int source_var_thresh; + int frames_till_next_var_check; + + int frame_flags; + + search_site_config ss_cfg; + + int mbmode_cost[INTRA_MODES]; + unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES]; + int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + + PICK_MODE_CONTEXT *leaf_tree; + PC_TREE *pc_tree; + PC_TREE *pc_root; + int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; + + int multi_arf_allowed; + int multi_arf_enabled; + int multi_arf_last_grp_enabled; + +#if CONFIG_VP9_TEMPORAL_DENOISING + VP9_DENOISER denoiser; +#endif +} VP9_COMP; + +void vp9_initialize_enc(); + +struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf); +void vp9_remove_compressor(VP9_COMP *cpi); + +void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); + + // receive a frames worth of data. caller can assume that a copy of this + // frame is made and not just a copy of the pointer.. 
+int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time_stamp); + +int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, + int64_t *time_stamp, int64_t *time_end, int flush); + +int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags); + +int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags); + +void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags); + +int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + +int vp9_update_entropy(VP9_COMP *cpi, int update); + +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); + +int vp9_set_internal_size(VP9_COMP *cpi, + VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + +int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, + unsigned int height); + +void vp9_set_svc(VP9_COMP *cpi, int use_svc); + +int vp9_get_quantizer(struct VP9_COMP *cpi); + +static INLINE int get_ref_frame_idx(const VP9_COMP *cpi, + MV_REFERENCE_FRAME ref_frame) { + if (ref_frame == LAST_FRAME) { + return cpi->lst_fb_idx; + } else if (ref_frame == GOLDEN_FRAME) { + return cpi->gld_fb_idx; + } else { + return cpi->alt_fb_idx; + } +} + +static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( + VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { + VP9_COMMON * const cm = &cpi->common; + return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]] + .buf; +} + +static INLINE int get_token_alloc(int mb_rows, int mb_cols) { + // TODO(JBB): double check we can't exceed this token count if we have a + // 32x32 transform crossing a boundary at a multiple of 16. + // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full + // resolution. We assume up to 1 token per pixel, and then allow + // a head room of 4. + return mb_rows * mb_cols * (16 * 16 * 3 + 4); +} + +int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); + +void vp9_alloc_compressor_data(VP9_COMP *cpi); + +void vp9_scale_references(VP9_COMP *cpi); + +void vp9_update_reference_frames(VP9_COMP *cpi); + +void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); + +YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled); + +void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); + +static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { + return cpi->use_svc && + (cpi->svc.number_temporal_layers > 1 || + cpi->svc.number_spatial_layers > 1) && + (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2); +} + +static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { + return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && + (cpi->oxcf.play_alternate && + (!is_two_pass_svc(cpi) || + cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id])); +} + +static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, + MV_REFERENCE_FRAME ref0, + MV_REFERENCE_FRAME ref1) { + xd->block_refs[0] = &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME + : 0]; + xd->block_refs[1] = &cm->frame_refs[ref1 >= LAST_FRAME ? 
ref1 - LAST_FRAME + : 0]; +} + +static INLINE int get_chessboard_index(const int frame_index) { + return frame_index & 0x1; +} + +static INLINE int *cond_sad_list(const struct VP9_COMP *cpi, int *sad_list) { + return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_ENCODER_H_ diff --git a/media/libvpx/vp9/common/vp9_extend.c b/media/libvpx/vp9/encoder/vp9_extend.c similarity index 90% rename from media/libvpx/vp9/common/vp9_extend.c rename to media/libvpx/vp9/encoder/vp9_extend.c index 836bf0e79e05..e8517c889252 100644 --- a/media/libvpx/vp9/common/vp9_extend.c +++ b/media/libvpx/vp9/encoder/vp9_extend.c @@ -11,7 +11,7 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_extend.h" +#include "vp9/encoder/vp9_extend.h" static void copy_and_extend_plane(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch, @@ -75,18 +75,6 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; -#if CONFIG_ALPHA - const int et_a = dst->border >> (dst->alpha_height != dst->y_height); - const int el_a = dst->border >> (dst->alpha_width != dst->y_width); - const int eb_a = et_a + dst->alpha_height - src->alpha_height; - const int er_a = el_a + dst->alpha_width - src->alpha_width; - - copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, - dst->alpha_buffer, dst->alpha_stride, - src->alpha_width, src->alpha_height, - et_a, el_a, eb_a, er_a); -#endif - copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_width, src->y_height, diff --git a/media/libvpx/vp9/common/vp9_extend.h b/media/libvpx/vp9/encoder/vp9_extend.h similarity index 81% rename from media/libvpx/vp9/common/vp9_extend.h rename to media/libvpx/vp9/encoder/vp9_extend.h index 7ff79b7b6b3c..058fe09cf983 100644 --- a/media/libvpx/vp9/common/vp9_extend.h +++ b/media/libvpx/vp9/encoder/vp9_extend.h @@ -8,12 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_EXTEND_H_ -#define VP9_COMMON_VP9_EXTEND_H_ +#ifndef VP9_ENCODER_VP9_EXTEND_H_ +#define VP9_ENCODER_VP9_EXTEND_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); @@ -22,4 +26,8 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int srcy, int srcx, int srch, int srcw); -#endif // VP9_COMMON_VP9_EXTEND_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_EXTEND_H_ diff --git a/media/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/vp9/encoder/vp9_firstpass.c index 791bc9ef7754..8041b59cfc66 100644 --- a/media/libvpx/vp9/encoder/vp9_firstpass.c +++ b/media/libvpx/vp9/encoder/vp9_firstpass.c @@ -8,32 +8,32 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include #include +#include #include -#include "vp9/encoder/vp9_block.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_encodeintra.h" -#include "vp9/encoder/vp9_mcomp.h" -#include "vp9/encoder/vp9_firstpass.h" + +#include "./vpx_scale_rtcd.h" + +#include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" +#include "vpx_scale/yv12config.h" + +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" // vp9_setup_dst_planes() +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/encoder/vp9_aq_variance.h" +#include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" -#include "vp9/common/vp9_extend.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_scale/yv12config.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/common/vp9_quant_common.h" -#include "vp9/common/vp9_entropymv.h" #include "vp9/encoder/vp9_encodemv.h" -#include "vp9/encoder/vp9_vaq.h" -#include "./vpx_scale_rtcd.h" -// TODO(jkoleszar): for setup_dst_planes -#include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_variance.h" #define OUTPUT_FPF 0 @@ -44,14 +44,17 @@ #define GF_RMAX 96.0 #define ERR_DIVISOR 150.0 #define MIN_DECAY_FACTOR 0.1 +#define SVC_FACTOR_PT_LOW 0.45 +#define FACTOR_PT_LOW 0.5 +#define FACTOR_PT_HIGH 0.9 #define KF_MB_INTRA_MIN 150 #define GF_MB_INTRA_MIN 100 #define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001) -#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0 -#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0 +#define MIN_KF_BOOST 300 +#define MIN_GF_INTERVAL 4 static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -59,71 +62,41 @@ static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { *b = temp; } -static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame); - -static int select_cq_level(int qindex) { - int ret_val = QINDEX_RANGE - 1; - int i; - - double target_q = (vp9_convert_qindex_to_q(qindex) * 0.5847) + 1.0; - - for (i = 0; i < QINDEX_RANGE; i++) { - if (target_q <= vp9_convert_qindex_to_q(i)) { - ret_val = i; - break; - } - } - - return ret_val; +static int gfboost_qadjust(int qindex) { + const double q = vp9_convert_qindex_to_q(qindex); + return (int)((0.00000828 * q * q * q) + + (-0.0055 * q * q) + + (1.32 * q) + 79.3); } - // Resets the first pass file to the given position using a relative seek from // the current position. 
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) { - cpi->twopass.stats_in = position; +static void reset_fpf_position(TWO_PASS *p, + const FIRSTPASS_STATS *position) { + p->stats_in = position; } -static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) { - if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) - return EOF; - - *next_frame = *cpi->twopass.stats_in; - return 1; -} - -// Read frame stats at an offset from the current position -static int read_frame_stats(VP9_COMP *cpi, - FIRSTPASS_STATS *frame_stats, - int offset) { - FIRSTPASS_STATS *fps_ptr = cpi->twopass.stats_in; - - // Check legality of offset - if (offset >= 0) { - if (&fps_ptr[offset] >= cpi->twopass.stats_in_end) - return EOF; - } else if (offset < 0) { - if (&fps_ptr[offset] < cpi->twopass.stats_in_start) - return EOF; +// Read frame stats at an offset from the current position. +static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_in_start)) { + return NULL; } - *frame_stats = fps_ptr[offset]; - return 1; + return &p->stats_in[offset]; } -static int input_stats(VP9_COMP *cpi, FIRSTPASS_STATS *fps) { - if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_in_end) return EOF; - *fps = *cpi->twopass.stats_in; - cpi->twopass.stats_in = - (void *)((char *)cpi->twopass.stats_in + sizeof(FIRSTPASS_STATS)); + *fps = *p->stats_in; + ++p->stats_in; return 1; } -static void output_stats(const VP9_COMP *cpi, - struct vpx_codec_pkt_list *pktlist, - FIRSTPASS_STATS *stats) { +static void output_stats(FIRSTPASS_STATS *stats, + struct vpx_codec_pkt_list *pktlist) { struct vpx_codec_cx_pkt pkt; pkt.kind = VPX_CODEC_STATS_PKT; pkt.data.twopass_stats.buf = stats; @@ -132,19 +105,17 @@ static void output_stats(const VP9_COMP *cpi, // TEMP debug code #if OUTPUT_FPF - { FILE *fpfile; fpfile = fopen("firstpass.stt", "a"); - fprintf(stdout, "%12.0f %12.0f %12.0f %12.0f %12.0f %12.4f %12.4f" + fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f" "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f" "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n", stats->frame, stats->intra_error, stats->coded_error, stats->sr_coded_error, - stats->ssim_weighted_pred_err, stats->pcnt_inter, stats->pcnt_motion, stats->pcnt_second_ref, @@ -164,12 +135,22 @@ static void output_stats(const VP9_COMP *cpi, #endif } +#if CONFIG_FP_MB_STATS +static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm, + struct vpx_codec_pkt_list *pktlist) { + struct vpx_codec_cx_pkt pkt; + pkt.kind = VPX_CODEC_FPMB_STATS_PKT; + pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; + pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t); + vpx_codec_pkt_list_add(pktlist, &pkt); +} +#endif + static void zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->intra_error = 0.0; section->coded_error = 0.0; section->sr_coded_error = 0.0; - section->ssim_weighted_pred_err = 0.0; section->pcnt_inter = 0.0; section->pcnt_motion = 0.0; section->pcnt_second_ref = 0.0; @@ -184,14 +165,16 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; + section->spatial_layer_id = 0; } -static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) { +static void accumulate_stats(FIRSTPASS_STATS 
*section, + const FIRSTPASS_STATS *frame) { section->frame += frame->frame; + section->spatial_layer_id = frame->spatial_layer_id; section->intra_error += frame->intra_error; section->coded_error += frame->coded_error; section->sr_coded_error += frame->sr_coded_error; - section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err; section->pcnt_inter += frame->pcnt_inter; section->pcnt_motion += frame->pcnt_motion; section->pcnt_second_ref += frame->pcnt_second_ref; @@ -208,12 +191,12 @@ static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) { section->duration += frame->duration; } -static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) { +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { section->frame -= frame->frame; section->intra_error -= frame->intra_error; section->coded_error -= frame->coded_error; section->sr_coded_error -= frame->sr_coded_error; - section->ssim_weighted_pred_err -= frame->ssim_weighted_pred_err; section->pcnt_inter -= frame->pcnt_inter; section->pcnt_motion -= frame->pcnt_motion; section->pcnt_second_ref -= frame->pcnt_second_ref; @@ -230,116 +213,32 @@ static void subtract_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame) { section->duration -= frame->duration; } -static void avg_stats(FIRSTPASS_STATS *section) { - if (section->count < 1.0) - return; - - section->intra_error /= section->count; - section->coded_error /= section->count; - section->sr_coded_error /= section->count; - section->ssim_weighted_pred_err /= section->count; - section->pcnt_inter /= section->count; - section->pcnt_second_ref /= section->count; - section->pcnt_neutral /= section->count; - section->pcnt_motion /= section->count; - section->MVr /= section->count; - section->mvr_abs /= section->count; - section->MVc /= section->count; - section->mvc_abs /= section->count; - section->MVrv /= section->count; - section->MVcv /= section->count; - section->mv_in_out_count /= section->count; - section->duration /= section->count; -} // Calculate a modified Error used in distributing bits between easier and // harder frames. -static double calculate_modified_err(VP9_COMP *cpi, - FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats; - const double av_err = stats->ssim_weighted_pred_err / stats->count; - const double this_err = this_frame->ssim_weighted_pred_err; - return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err), - this_err > av_err ? 
POW1 : POW2); +static double calculate_modified_err(const TWO_PASS *twopass, + const VP9EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *const stats = &twopass->total_stats; + const double av_err = stats->coded_error / stats->count; + const double modified_error = av_err * + pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + return fclamp(modified_error, + twopass->modified_error_min, twopass->modified_error_max); } -static const double weight_table[256] = { - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, - 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.031250, 0.062500, - 0.093750, 0.125000, 0.156250, 0.187500, 0.218750, 0.250000, 0.281250, - 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750, 0.500000, - 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750, - 0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, - 0.968750, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, - 1.000000, 1.000000, 1.000000, 1.000000 -}; +// This function returns the maximum target rate per frame. 
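/* Worked example for frame_max_bits() below, with hypothetical numbers:
 * an average frame bandwidth of 24000 bits and two_pass_vbrmax_section of
 * 400 (percent) give (24000 * 400) / 100 = 96000 bits, clamped afterwards
 * to [0, rc->max_frame_bandwidth]; the int64_t product guards against
 * overflow before the clamp. */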
+static int frame_max_bits(const RATE_CONTROL *rc, + const VP9EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->two_pass_vbrmax_section) / 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; -static double simple_weight(YV12_BUFFER_CONFIG *source) { - int i, j; - - uint8_t *src = source->y_buffer; - double sum_weights = 0.0; - - // Loop through the Y plane examining levels and creating a weight for - // the image. - i = source->y_height; - do { - j = source->y_width; - do { - sum_weights += weight_table[ *src]; - src++; - } while (--j); - src -= source->y_width; - src += source->y_stride; - } while (--i); - - sum_weights /= (source->y_height * source->y_width); - - return sum_weights; -} - - -// This function returns the current per frame maximum bitrate target. -static int frame_max_bits(VP9_COMP *cpi) { - // Max allocation for a single frame based on the max section guidelines - // passed in and how many bits are left. - // For VBR base this on the bits and frames left plus the - // two_pass_vbrmax_section rate passed in by the user. - const double max_bits = (1.0 * cpi->twopass.bits_left / - (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) * - (cpi->oxcf.two_pass_vbrmax_section / 100.0); - - // Trap case where we are out of bits. - return MAX((int)max_bits, 0); + return (int)max_bits; } void vp9_init_first_pass(VP9_COMP *cpi) { @@ -347,136 +246,147 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats); + if (is_two_pass_svc(cpi)) { + int i; + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + output_stats(&cpi->svc.layer_context[i].twopass.total_stats, + cpi->output_pkt_list); + } + } else { + output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); + } } -static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - YV12_BUFFER_CONFIG *recon_buffer, - int *best_motion_err, int recon_yoffset) { - MACROBLOCKD *const xd = &x->e_mbd; - - // Set up pointers for this macro block recon buffer - xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; - - switch (xd->mi_8x8[0]->mbmi.sb_type) { +static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { case BLOCK_8X8: - vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - (unsigned int *)(best_motion_err)); - break; + return vp9_mse8x8; case BLOCK_16X8: - vp9_mse16x8(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - (unsigned int *)(best_motion_err)); - break; + return vp9_mse16x8; case BLOCK_8X16: - vp9_mse8x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - (unsigned int *)(best_motion_err)); - break; + return vp9_mse8x16; default: - vp9_mse16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - (unsigned int *)(best_motion_err)); - break; + return vp9_mse16x16; } } +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const vp9_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +// Refine the motion search range according to the frame dimension +// for first pass test. 
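/* For scale, assuming MAX_FULL_PEL_VAL is 1023 as in
 * vp9/encoder/vp9_mcomp.h: a 176x144 frame gives sr = 3
 * (144 -> 288 -> 576 -> 1152 >= 1023) while 1920x1080 gives sr = 0, so
 * small frames start the diamond search below with a larger step and
 * fewer refinement passes. */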
+static int get_search_range(const VP9_COMMON *cm) { + int sr = 0; + const int dim = MIN(cm->width, cm->height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) + ++sr; + return sr; +} + static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *ref_mv, MV *best_mv, - YV12_BUFFER_CONFIG *recon_buffer, - int *best_motion_err, int recon_yoffset) { + const MV *ref_mv, MV *best_mv, + int *best_motion_err) { MACROBLOCKD *const xd = &x->e_mbd; - int num00; + MV tmp_mv = {0, 0}; + MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3}; + int num00, tmp_err, n; + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + const int new_mv_mode_penalty = 256; - int_mv tmp_mv; - int_mv ref_mv_full; - - int tmp_err; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; - int n; - vp9_variance_fn_ptr_t v_fn_ptr = - cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type]; - int new_mv_mode_penalty = 256; - - int sr = 0; - int quart_frm = MIN(cpi->common.width, cpi->common.height); - - // refine the motion search range accroding to the frame dimension - // for first pass test - while ((quart_frm << sr) < MAX_FULL_PEL_VAL) - sr++; - if (sr) - sr--; - - step_param += sr; + const int sr = get_search_range(&cpi->common); + step_param += sr; further_steps -= sr; - // override the default variance function to use MSE - switch (xd->mi_8x8[0]->mbmi.sb_type) { - case BLOCK_8X8: - v_fn_ptr.vf = vp9_mse8x8; - break; - case BLOCK_16X8: - v_fn_ptr.vf = vp9_mse16x8; - break; - case BLOCK_8X16: - v_fn_ptr.vf = vp9_mse8x16; - break; - default: - v_fn_ptr.vf = vp9_mse16x16; - break; - } + // Override the default variance function to use MSE. + v_fn_ptr.vf = get_block_variance_fn(bsize); - // Set up pointers for this macro block recon buffer - xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; - - // Initial step/diamond search centred on best mv - tmp_mv.as_int = 0; - ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3; - ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3; - tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param, - x->sadperbit16, &num00, &v_fn_ptr, - x->nmvjointcost, - x->mvcost, ref_mv); + // Center the initial step/diamond search on best mv. + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, + step_param, + x->sadperbit16, &num00, &v_fn_ptr, ref_mv); + if (tmp_err < INT_MAX) + tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; if (tmp_err < *best_motion_err) { *best_motion_err = tmp_err; - best_mv->row = tmp_mv.as_mv.row; - best_mv->col = tmp_mv.as_mv.col; + *best_mv = tmp_mv; } - // Further step/diamond searches as necessary + // Carry out further step/diamond searches as necessary. 
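/* The acceptance test used in both search passes, reduced to a standalone
 * helper as a sketch (MV as in the vp9 headers; the constant mirrors
 * new_mv_mode_penalty above): */
static void keep_if_better(int tmp_err, MV tmp_mv,
                           int *best_motion_err, MV *best_mv) {
  const int new_mv_mode_penalty = 256;
  if (tmp_err < INT_MAX - new_mv_mode_penalty)
    tmp_err += new_mv_mode_penalty;  /* bias against fresh vectors */
  if (tmp_err < *best_motion_err) {  /* keep only clear improvements */
    *best_motion_err = tmp_err;
    *best_mv = tmp_mv;
  }
}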
n = num00; num00 = 0; while (n < further_steps) { - n++; + ++n; if (num00) { - num00--; + --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, step_param + n, x->sadperbit16, - &num00, &v_fn_ptr, - x->nmvjointcost, - x->mvcost, ref_mv); + &num00, &v_fn_ptr, ref_mv); + if (tmp_err < INT_MAX) + tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; if (tmp_err < *best_motion_err) { *best_motion_err = tmp_err; - best_mv->row = tmp_mv.as_mv.row; - best_mv->col = tmp_mv.as_mv.col; + *best_mv = tmp_mv; } } } } -void vp9_first_pass(VP9_COMP *cpi) { +static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) { + if (2 * mb_col + 1 < cm->mi_cols) { + return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_16X16 + : BLOCK_16X8; + } else { + return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_8X16 + : BLOCK_8X8; + } +} + +static int find_fp_qindex() { + int i; + + for (i = 0; i < QINDEX_RANGE; ++i) + if (vp9_convert_qindex_to_q(i) >= 30.0) + break; + + if (i == QINDEX_RANGE) + i--; + + return i; +} + +static void set_first_pass_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || + (cpi->frame_flags & FRAMEFLAGS_KEY))) { + cm->frame_type = KEY_FRAME; + } else { + cm->frame_type = INTER_FRAME; + } + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; +} + +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->mb; VP9_COMMON *const cm = &cpi->common; @@ -484,24 +394,23 @@ void vp9_first_pass(VP9_COMP *cpi) { TileInfo tile; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - PICK_MODE_CONTEXT *ctx = &x->sb64_context; + const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none; int i; int recon_yoffset, recon_uvoffset; - const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx]; - const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx]; - YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx]; - YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx]; + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); - const int recon_y_stride = lst_yv12->y_stride; - const int recon_uv_stride = lst_yv12->uv_stride; + int recon_y_stride = lst_yv12->y_stride; + int recon_uv_stride = lst_yv12->uv_stride; + int uv_mb_height = 16 >> (lst_yv12->y_height > lst_yv12->uv_height); int64_t intra_error = 0; int64_t coded_error = 0; int64_t sr_coded_error = 0; int sum_mvr = 0, sum_mvc = 0; int sum_mvr_abs = 0, sum_mvc_abs = 0; - int sum_mvrs = 0, sum_mvcs = 0; + int64_t sum_mvrs = 0, sum_mvcs = 0; int mvcount = 0; int intercount = 0; int second_ref_count = 0; @@ -509,113 +418,146 @@ void vp9_first_pass(VP9_COMP *cpi) { int neutral_count = 0; int new_mv_count = 0; int sum_in_vectors = 0; - uint32_t lastmv_as_int = 0; + MV lastmv = {0, 0}; + TWO_PASS *twopass = &cpi->twopass; + const MV zero_mv = {0, 0}; + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? 
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL; - int_mv zero_ref_mv; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs); + } +#endif - zero_ref_mv.as_int = 0; + vp9_clear_system_state(); - vp9_clear_system_state(); // __asm emms; + set_first_pass_params(cpi); + vp9_set_quantizer(cm, find_fp_qindex()); + + if (lc != NULL) { + MV_REFERENCE_FRAME ref_frame = LAST_FRAME; + twopass = &lc->twopass; + + if (cpi->common.current_video_frame == 0) { + cpi->ref_frame_flags = 0; + } else { + if (lc->current_video_frame_in_layer < + (unsigned int)cpi->svc.number_temporal_layers) + cpi->ref_frame_flags = VP9_GOLD_FLAG; + else + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + + vp9_scale_references(cpi); + + // Use either last frame or alt frame for motion search. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); + ref_frame = LAST_FRAME; + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + first_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + ref_frame = GOLDEN_FRAME; + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + recon_y_stride = new_yv12->y_stride; + recon_uv_stride = new_yv12->uv_stride; + uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); + + // Disable golden frame for svc first pass for now. + gld_yv12 = NULL; + set_ref_ptrs(cm, xd, ref_frame, NONE); + + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source); + } + + vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); vp9_setup_src_planes(x, cpi->Source, 0, 0); - setup_pre_planes(xd, 0, lst_yv12, 0, 0, NULL); - setup_dst_planes(xd, new_yv12, 0, 0); + vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); + vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0); - xd->mi_8x8 = cm->mi_grid_visible; - // required for vp9_frame_init_quantizer - xd->mi_8x8[0] = cm->mi; - - setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; vp9_frame_init_quantizer(cpi); for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][1]; - pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; + p[i].qcoeff = ctx->qcoeff_pbuf[i][1]; pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - pd[i].eobs = ctx->eobs_pbuf[i][1]; + p[i].eobs = ctx->eobs_pbuf[i][1]; } x->skip_recode = 0; + vp9_init_mv_probs(cm); + vp9_initialize_rd_consts(cpi); - // Initialise the MV cost table to the defaults - // if( cm->current_video_frame == 0) - // if ( 0 ) - { - vp9_init_mv_probs(cm); - vp9_initialize_rd_consts(cpi); - } - - // tiling is ignored in the first pass + // Tiling is ignored in the first pass. vp9_tile_init(&tile, cm, 0, 0); - // for each macroblock row in image - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - int_mv best_ref_mv; + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + MV best_ref_mv = {0, 0}; - best_ref_mv.as_int = 0; - - // reset above block coeffs + // Reset above block coeffs. xd->up_available = (mb_row != 0); recon_yoffset = (mb_row * recon_y_stride * 16); - recon_uvoffset = (mb_row * recon_uv_stride * 8); + recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders + // outside the UMV borders. 
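/* Illustration of the limits set below: with BORDER_MV_PIXELS_B16 equal
 * to 16 + VP9_INTERP_EXTEND (20 when VP9_INTERP_EXTEND is 4, an
 * assumption from the upstream headers), the top macroblock row of a
 * 720p frame (mb_rows = 45) gets mv_row_min = -20 and
 * mv_row_max = 44 * 16 + 20 = 724, i.e. vectors may reach 20 full pixels
 * into the extended border on either side. */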
x->mv_row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - // for each macroblock col in image - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { int this_error; - int gf_motion_error = INT_MAX; - int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - double error_weight; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + double error_weight = 1.0; + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif - vp9_clear_system_state(); // __asm emms; - error_weight = 1.0; // avoid uninitialized warnings + vp9_clear_system_state(); xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; xd->left_available = (mb_col != 0); - - if (mb_col * 2 + 1 < cm->mi_cols) { - if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X16; - } else { - xd->mi_8x8[0]->mbmi.sb_type = BLOCK_16X8; - } - } else { - if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X16; - } else { - xd->mi_8x8[0]->mbmi.sb_type = BLOCK_8X8; - } - } - xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0]->mbmi.sb_type = bsize; + xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; set_mi_row_col(xd, &tile, - mb_row << 1, - num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type], - mb_col << 1, - num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type], + mb_row << 1, num_8x8_blocks_high_lookup[bsize], + mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, cm->mi_cols); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type); + const int energy = vp9_block_energy(cpi, x, bsize); error_weight = vp9_vaq_inv_q_ratio(energy); } - // do intra 16x16 prediction - this_error = vp9_encode_intra(x, use_dc_pred); + // Do intra 16x16 prediction. + x->skip_encode = 0; + xd->mi[0]->mbmi.mode = DC_PRED; + xd->mi[0]->mbmi.tx_size = use_dc_pred ? + (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; + vp9_encode_intra_block_plane(x, bsize, 0); + this_error = vp9_get_mb_ss(x->plane[0].src_diff); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - this_error *= error_weight; + vp9_clear_system_state(); + this_error = (int)(this_error * error_weight); } - // intrapenalty below deals with situations where the intra and inter - // error scores are very low (eg a plain black frame). + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). // We do not have special cases in first pass for 0,0 and nearest etc so // all inter modes carry an overhead cost estimate for the mv. // When the error score is very low this causes us to pick all or lots of @@ -623,155 +565,228 @@ void vp9_first_pass(VP9_COMP *cpi) { // This penalty adds a cost matching that of a 0,0 mv to the intra case. this_error += intrapenalty; - // Cumulative intra error total + // Accumulate the intra error. intra_error += (int64_t)this_error; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + } +#endif + // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. 
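/* The intra score above comes from vp9_get_mb_ss() on the 16x16 luma
 * residual, a plain sum of squares; a standalone equivalent, assuming a
 * 256-sample (16x16) buffer: */
static unsigned int mb_sum_squares(const int16_t *diff) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; ++i)
    ss += (unsigned int)(diff[i] * diff[i]);  /* {4, -3, 2, 0, ...} -> 29 */
  return ss;
}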
x->mv_col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); - x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) - + BORDER_MV_PIXELS_B16; + x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - // Other than for the first frame do a motion search + // Other than for the first frame do a motion search. if (cm->current_video_frame > 0) { - int tmp_err; - int motion_error = INT_MAX; - int_mv mv, tmp_mv; + int tmp_err, motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. + MV mv = {0, 0} , tmp_mv = {0, 0}; + struct buf_2d unscaled_last_source_buf_2d; - // Simple 0,0 motion with no mv overhead - zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset); - mv.as_int = tmp_mv.as_int = 0; + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search - first_pass_motion_search(cpi, x, &best_ref_mv, - &mv.as_mv, lst_yv12, - &motion_error, recon_yoffset); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - motion_error *= error_weight; - } + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = + cpi->unscaled_last_source->y_stride; + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); - // If the current best reference mv is not centered on 0,0 then do a 0,0 - // based search as well. - if (best_ref_mv.as_int) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv, - lst_yv12, &tmp_err, recon_yoffset); + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25 || lc != NULL) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - tmp_err *= error_weight; + vp9_clear_system_state(); + motion_error = (int)(motion_error * error_weight); } - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv.as_int = tmp_mv.as_int; - } - } + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(&best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_clear_system_state(); + tmp_err = (int)(tmp_err * error_weight); + } - // Experimental search in an older reference frame - if (cm->current_video_frame > 1) { - // Simple 0,0 motion with no mv overhead - zz_motion_search(cpi, x, gld_yv12, - &gf_motion_error, recon_yoffset); - - first_pass_motion_search(cpi, x, &zero_ref_mv, - &tmp_mv.as_mv, gld_yv12, - &gf_motion_error, recon_yoffset); - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_clear_system_state(); // __asm emms; - gf_motion_error *= error_weight; + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } } - if ((gf_motion_error < motion_error) && - (gf_motion_error < this_error)) { - second_ref_count++; + // Search in an older reference frame. 
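/* The gate guarding the searches above, restated as a predicate: the
 * diamond search runs only when the raw 0,0 error against the previous
 * source exceeds the hard-coded threshold flagged by the TODO, or when
 * encoding a two-pass SVC layer (lc != NULL). */
static int needs_full_search(int raw_motion_error, int is_svc_layer) {
  return raw_motion_error > 25 || is_svc_layer;
}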
+ if (cm->current_video_frame > 1 && gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. + int gf_motion_error; + + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); + + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, + &gf_motion_error); + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + vp9_clear_system_state(); + gf_motion_error = (int)(gf_motion_error * error_weight); + } + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++second_ref_count; + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + sr_coded_error += gf_motion_error; + else + sr_coded_error += this_error; + } else { + sr_coded_error += motion_error; } - - // Reset to last frame as reference buffer - xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = lst_yv12->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = lst_yv12->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame - // take the best of the motion predicted score and - // the intra coded error (just as will be done for) - // accumulation of "coded_error" for the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; } else { sr_coded_error += motion_error; } - /* Intra assumed best */ - best_ref_mv.as_int = 0; + + // Start by assuming that intra mode is best. + best_ref_mv.row = 0; + best_ref_mv.col = 0; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif if (motion_error <= this_error) { - // Keep a count of cases where the inter and intra were - // very close and very low. This helps with scene cut - // detection for example in cropped clips with black bars - // at the sides or top and bottom. - if ((((this_error - intrapenalty) * 9) <= - (motion_error * 10)) && - (this_error < (2 * intrapenalty))) { - neutral_count++; - } + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. 
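/* Numeric illustration of the test below, taking intrapenalty = 256 for
 * the sake of the example: with this_error = 400 the block counts as
 * neutral only when (400 - 256) * 9 = 1296 <= 10 * motion_error, i.e.
 * motion_error >= 130, and 400 < 2 * 256 holds; intra and inter must be
 * cheap and within roughly ten percent of one another. */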
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + this_error < 2 * intrapenalty) + ++neutral_count; - mv.as_mv.row *= 8; - mv.as_mv.col *= 8; + mv.row *= 8; + mv.col *= 8; this_error = motion_error; - vp9_set_mbmode_and_mvs(x, NEWMV, &mv); - xd->mi_8x8[0]->mbmi.tx_size = TX_4X4; - xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME; - xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE; - vp9_build_inter_predictors_sby(xd, mb_row << 1, - mb_col << 1, - xd->mi_8x8[0]->mbmi.sb_type); - vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type); - sum_mvr += mv.as_mv.row; - sum_mvr_abs += abs(mv.as_mv.row); - sum_mvc += mv.as_mv.col; - sum_mvc_abs += abs(mv.as_mv.col); - sum_mvrs += mv.as_mv.row * mv.as_mv.row; - sum_mvcs += mv.as_mv.col * mv.as_mv.col; - intercount++; + xd->mi[0]->mbmi.mode = NEWMV; + xd->mi[0]->mbmi.mv[0].as_mv = mv; + xd->mi[0]->mbmi.tx_size = TX_4X4; + xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME; + xd->mi[0]->mbmi.ref_frame[1] = NONE; + vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); + vp9_encode_sby_pass1(x, bsize); + sum_mvr += mv.row; + sum_mvr_abs += abs(mv.row); + sum_mvc += mv.col; + sum_mvc_abs += abs(mv.col); + sum_mvrs += mv.row * mv.row; + sum_mvcs += mv.col * mv.col; + ++intercount; - best_ref_mv.as_int = mv.as_int; + best_ref_mv = mv; - // Was the vector non-zero - if (mv.as_int) { - mvcount++; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // inter predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_SMALL_MASK; + } + } +#endif - // Was it different from the last non zero vector - if (mv.as_int != lastmv_as_int) - new_mv_count++; - lastmv_as_int = mv.as_int; + if (!is_zero_mv(&mv)) { + ++mvcount; - // Does the Row vector point inwards or outwards +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + + // Non-zero vector, was it different from the last non zero vector? + if (!is_equal_mv(&mv, &lastmv)) + ++new_mv_count; + lastmv = mv; + + // Does the row vector point inwards or outwards? 
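/* The two if/else ladders below (rows here, columns just after) reduce
 * to one signed vote per axis; a compact equivalent for a single
 * coordinate, included as a sketch: */
static int in_out_vote(int mb_pos, int half, int mv_comp) {
  if (mv_comp == 0 || mb_pos == half)
    return 0;  /* zero component or centre band: no vote */
  return (mb_pos < half) ? (mv_comp > 0 ? -1 : 1)
                         : (mv_comp > 0 ? 1 : -1);
}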
if (mb_row < cm->mb_rows / 2) { - if (mv.as_mv.row > 0) - sum_in_vectors--; - else if (mv.as_mv.row < 0) - sum_in_vectors++; + if (mv.row > 0) + --sum_in_vectors; + else if (mv.row < 0) + ++sum_in_vectors; } else if (mb_row > cm->mb_rows / 2) { - if (mv.as_mv.row > 0) - sum_in_vectors++; - else if (mv.as_mv.row < 0) - sum_in_vectors--; + if (mv.row > 0) + ++sum_in_vectors; + else if (mv.row < 0) + --sum_in_vectors; } - // Does the Row vector point inwards or outwards + // Does the col vector point inwards or outwards? if (mb_col < cm->mb_cols / 2) { - if (mv.as_mv.col > 0) - sum_in_vectors--; - else if (mv.as_mv.col < 0) - sum_in_vectors++; + if (mv.col > 0) + --sum_in_vectors; + else if (mv.col < 0) + ++sum_in_vectors; } else if (mb_col > cm->mb_cols / 2) { - if (mv.as_mv.col > 0) - sum_in_vectors++; - else if (mv.as_mv.col < 0) - sum_in_vectors--; + if (mv.col > 0) + ++sum_in_vectors; + else if (mv.col < 0) + --sum_in_vectors; } } } @@ -780,108 +795,109 @@ void vp9_first_pass(VP9_COMP *cpi) { } coded_error += (int64_t)this_error; - // adjust to the next column of macroblocks + // Adjust to the next column of MBs. x->plane[0].src.buf += 16; - x->plane[1].src.buf += 8; - x->plane[2].src.buf += 8; + x->plane[1].src.buf += uv_mb_height; + x->plane[2].src.buf += uv_mb_height; recon_yoffset += 16; - recon_uvoffset += 8; + recon_uvoffset += uv_mb_height; } - // adjust to the next row of mbs + // Adjust to the next row of MBs. x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - x->plane[1].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols; - x->plane[2].src.buf += 8 * x->plane[1].src.stride - 8 * cm->mb_cols; + x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride - + uv_mb_height * cm->mb_cols; + x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride - + uv_mb_height * cm->mb_cols; - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); } - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); { - double weight = 0.0; - FIRSTPASS_STATS fps; - fps.frame = cm->current_video_frame; + fps.frame = cm->current_video_frame; + fps.spatial_layer_id = cpi->svc.spatial_layer_id; fps.intra_error = (double)(intra_error >> 8); fps.coded_error = (double)(coded_error >> 8); fps.sr_coded_error = (double)(sr_coded_error >> 8); - weight = simple_weight(cpi->Source); - - - if (weight < 0.1) - weight = 0.1; - - fps.ssim_weighted_pred_err = fps.coded_error * weight; - - fps.pcnt_inter = 0.0; - fps.pcnt_motion = 0.0; - fps.MVr = 0.0; - fps.mvr_abs = 0.0; - fps.MVc = 0.0; - fps.mvc_abs = 0.0; - fps.MVrv = 0.0; - fps.MVcv = 0.0; - fps.mv_in_out_count = 0.0; - fps.new_mv_count = 0.0; - fps.count = 1.0; - - fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs; - fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs; - fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs; + fps.count = 1.0; + fps.pcnt_inter = (double)intercount / cm->MBs; + fps.pcnt_second_ref = (double)second_ref_count / cm->MBs; + fps.pcnt_neutral = (double)neutral_count / cm->MBs; if (mvcount > 0) { - fps.MVr = (double)sum_mvr / (double)mvcount; - fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount; - fps.MVc = (double)sum_mvc / (double)mvcount; - fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount; - fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / - (double)mvcount; - fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / - (double)mvcount; - fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2); + fps.MVr = 
(double)sum_mvr / mvcount; + fps.mvr_abs = (double)sum_mvr_abs / mvcount; + fps.MVc = (double)sum_mvc / mvcount; + fps.mvc_abs = (double)sum_mvc_abs / mvcount; + fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount; + fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount; + fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); fps.new_mv_count = new_mv_count; - - fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs; + fps.pcnt_motion = (double)mvcount / cm->MBs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; } // TODO(paulwilkins): Handle the case when duration is set to 0, or // something less than the full time between subsequent values of // cpi->source_time_stamp. - fps.duration = (double)(cpi->source->ts_end - - cpi->source->ts_start); + fps.duration = (double)(source->ts_end - source->ts_start); - // don't want to do output stats with a stack variable! - cpi->twopass.this_frame_stats = fps; - output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats); - accumulate_stats(&cpi->twopass.total_stats, &fps); + // Don't want to do output stats with a stack variable! + twopass->this_frame_stats = fps; + output_stats(&twopass->this_frame_stats, cpi->output_pkt_list); + accumulate_stats(&twopass->total_stats, &fps); + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + output_fpmb_stats(twopass->frame_mb_stats_buf, cm, cpi->output_pkt_list); + } +#endif } // Copy the previous Last Frame back into gf and and arf buffers if - // the prediction is good enough... but also dont allow it to lag too far - if ((cpi->twopass.sr_update_lag > 3) || + // the prediction is good enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && - (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) && - ((cpi->twopass.this_frame_stats.intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) > - 2.0))) { - vp8_yv12_copy_frame(lst_yv12, gld_yv12); - cpi->twopass.sr_update_lag = 1; + (twopass->this_frame_stats.pcnt_inter > 0.20) && + ((twopass->this_frame_stats.intra_error / + DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { + if (gld_yv12 != NULL) { + vp8_yv12_copy_frame(lst_yv12, gld_yv12); + } + twopass->sr_update_lag = 1; } else { - cpi->twopass.sr_update_lag++; + ++twopass->sr_update_lag; } - // swap frame pointers so last frame refers to the frame we just compressed - swap_yv12(lst_yv12, new_yv12); - vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y); + vp9_extend_frame_borders(new_yv12); + + if (lc != NULL) { + vp9_update_reference_frames(cpi); + } else { + // Swap frame pointers so last frame refers to the frame we just compressed. + swap_yv12(lst_yv12, new_yv12); + } // Special case for the first frame. Copy into the GF buffer as a second // reference. - if (cm->current_video_frame == 0) + if (cm->current_video_frame == 0 && gld_yv12 != NULL) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); + } - // use this to see what the first pass reconstruction looks like + // Use this to see what the first pass reconstruction looks like. 
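/* The spread statistics computed above use a single-pass raw-moment
 * form: with n = mvcount and row components r_i,
 *   MVr  = (sum r_i) / n,
 *   MVrv = ((sum r_i^2) - MVr^2 / n) / n,
 * and likewise for columns; a cheap stand-in for the motion field's
 * variance rather than the textbook estimator. */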
if (0) { char filename[512]; FILE *recon_file; @@ -897,52 +913,9 @@ void vp9_first_pass(VP9_COMP *cpi) { fclose(recon_file); } - cm->current_video_frame++; -} - -// Estimate a cost per mb attributable to overheads such as the coding of -// modes and motion vectors. -// Currently simplistic in its assumptions for testing. -// - - -static double bitcost(double prob) { - return -(log(prob) / log(2.0)); -} - -static int64_t estimate_modemvcost(VP9_COMP *cpi, - FIRSTPASS_STATS *fpstats) { -#if 0 - int mv_cost; - int mode_cost; - - double av_pct_inter = fpstats->pcnt_inter / fpstats->count; - double av_pct_motion = fpstats->pcnt_motion / fpstats->count; - double av_intra = (1.0 - av_pct_inter); - - double zz_cost; - double motion_cost; - double intra_cost; - - zz_cost = bitcost(av_pct_inter - av_pct_motion); - motion_cost = bitcost(av_pct_motion); - intra_cost = bitcost(av_intra); - - // Estimate of extra bits per mv overhead for mbs - // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb - mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9; - - // Crude estimate of overhead cost from modes - // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb - mode_cost = - (int)((((av_pct_inter - av_pct_motion) * zz_cost) + - (av_pct_motion * motion_cost) + - (av_intra * intra_cost)) * cpi->common.MBs) << 9; - - // return mv_cost + mode_cost; - // TODO(paulwilkins): Fix overhead costs for extended Q range. -#endif - return 0; + ++cm->current_video_frame; + if (cpi->use_svc) + vp9_inc_frame_in_layer(cpi); } static double calc_correction_factor(double err_per_mb, @@ -953,342 +926,167 @@ static double calc_correction_factor(double err_per_mb, const double error_term = err_per_mb / err_divisor; // Adjustment based on actual quantizer to power term. - const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low, + const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low, pt_high); - // Calculate correction factor + // Calculate correction factor. if (power_term < 1.0) assert(error_term >= 0.0); return fclamp(pow(error_term, power_term), 0.05, 5.0); } -// Given a current maxQ value sets a range for future values. -// PGW TODO.. -// This code removes direct dependency on QIndex to determine the range -// (now uses the actual quantizer) but has not been tuned. 
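/* A feel for calc_correction_factor() above, with illustrative inputs:
 * err_per_mb = 300 and err_divisor = 150 give an error term of 2.0; at a
 * q where the power term evaluates to 0.7, pow(2.0, 0.7) ~= 1.62, so the
 * per-mb bit estimate is inflated by about 62% (the factor is clamped to
 * [0.05, 5.0]). */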
-static void adjust_maxq_qrange(VP9_COMP *cpi) { - int i; - // Set the max corresponding to cpi->avg_q * 2.0 - double q = cpi->avg_q * 2.0; - cpi->twopass.maxq_max_limit = cpi->worst_quality; - for (i = cpi->best_quality; i <= cpi->worst_quality; i++) { - cpi->twopass.maxq_max_limit = i; - if (vp9_convert_qindex_to_q(i) >= q) - break; - } +static int get_twopass_worst_quality(const VP9_COMP *cpi, + const FIRSTPASS_STATS *stats, + int section_target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; - // Set the min corresponding to cpi->avg_q * 0.5 - q = cpi->avg_q * 0.5; - cpi->twopass.maxq_min_limit = cpi->best_quality; - for (i = cpi->worst_quality; i >= cpi->best_quality; i--) { - cpi->twopass.maxq_min_limit = i; - if (vp9_convert_qindex_to_q(i) <= q) - break; - } -} - -static int estimate_max_q(VP9_COMP *cpi, - FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { - int q; - int num_mbs = cpi->common.MBs; - int target_norm_bits_per_mb; - - double section_err = fpstats->coded_error / fpstats->count; - double sr_correction; - double err_per_mb = section_err / num_mbs; - double err_correction_factor; - double speed_correction = 1.0; - - if (section_target_bandwitdh <= 0) - return cpi->twopass.maxq_max_limit; // Highest value allowed - - target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20) - ? (512 * section_target_bandwitdh) / num_mbs - : 512 * (section_target_bandwitdh / num_mbs); - - // Look at the drop in prediction quality between the last frame - // and the GF buffer (which contained an older frame). - if (fpstats->sr_coded_error > fpstats->coded_error) { - double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) / - (fpstats->count * cpi->common.MBs); - sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25); + if (section_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed } else { - sr_correction = 0.75; + const int num_mbs = cpi->common.MBs; + const double section_err = stats->coded_error / stats->count; + const double err_per_mb = section_err / num_mbs; + const double speed_term = 1.0 + 0.04 * oxcf->speed; + const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth << + BPER_MB_NORMBITS) / num_mbs; + int q; + int is_svc_upper_layer = 0; + if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) + is_svc_upper_layer = 1; + + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + for (q = rc->best_quality; q < rc->worst_quality; ++q) { + const double factor = + calc_correction_factor(err_per_mb, ERR_DIVISOR, + is_svc_upper_layer ? SVC_FACTOR_PT_LOW : + FACTOR_PT_LOW, FACTOR_PT_HIGH, q); + const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q, + factor * speed_term); + if (bits_per_mb <= target_norm_bits_per_mb) + break; + } + + // Restriction on active max q for constrained quality mode. 
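/* Scaling note for the q loop above: target_norm_bits_per_mb is kept in
 * the same fixed-point domain that vp9_rc_bits_per_mb() returns, i.e.
 * bits << BPER_MB_NORMBITS (bits * 512, assuming BPER_MB_NORMBITS is 9
 * as in vp9_ratectrl.h), so the loop picks the first q whose estimated
 * per-mb rate fits the per-mb budget. */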
+ if (cpi->oxcf.rc_mode == VPX_CQ) + q = MAX(q, oxcf->cq_level); + return q; } - - // Calculate a corrective factor based on a rolling ratio of bits spent - // vs target bits - if (cpi->rolling_target_bits > 0 && - cpi->active_worst_quality < cpi->worst_quality) { - double rolling_ratio = (double)cpi->rolling_actual_bits / - (double)cpi->rolling_target_bits; - - if (rolling_ratio < 0.95) - cpi->twopass.est_max_qcorrection_factor -= 0.005; - else if (rolling_ratio > 1.05) - cpi->twopass.est_max_qcorrection_factor += 0.005; - - cpi->twopass.est_max_qcorrection_factor = fclamp( - cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0); - } - - // Corrections for higher compression speed settings - // (reduced compression expected) - // FIXME(jimbankoski): Once we settle on vp9 speed features we need to - // change this code. - if (cpi->compressor_speed == 1) - speed_correction = cpi->oxcf.cpu_used <= 5 ? - 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) : - 1.25; - - // Try and pick a max Q that will be high enough to encode the - // content at the given rate. - for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) { - int bits_per_mb_at_this_q; - - err_correction_factor = calc_correction_factor(err_per_mb, - ERR_DIVISOR, 0.4, 0.90, q) * - sr_correction * speed_correction * - cpi->twopass.est_max_qcorrection_factor; - - bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q, - err_correction_factor); - - if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) - break; - } - - // Restriction on active max q for constrained quality mode. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && - q < cpi->cq_target_quality) - q = cpi->cq_target_quality; - - // Adjust maxq_min_limit and maxq_max_limit limits based on - // average q observed in clip for non kf/gf/arf frames - // Give average a chance to settle though. - // PGW TODO.. This code is broken for the extended Q range - if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) && - cpi->ni_frames > 25) - adjust_maxq_qrange(cpi); - - return q; -} - -// For cq mode estimate a cq level that matches the observed -// complexity and data rate. -static int estimate_cq(VP9_COMP *cpi, - FIRSTPASS_STATS *fpstats, - int section_target_bandwitdh) { - int q; - int num_mbs = cpi->common.MBs; - int target_norm_bits_per_mb; - - double section_err = (fpstats->coded_error / fpstats->count); - double err_per_mb = section_err / num_mbs; - double err_correction_factor; - double sr_err_diff; - double sr_correction; - double speed_correction = 1.0; - double clip_iiratio; - double clip_iifactor; - - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) - ? (512 * section_target_bandwitdh) / num_mbs - : 512 * (section_target_bandwitdh / num_mbs); - - - // Corrections for higher compression speed settings - // (reduced compression expected) - if (cpi->compressor_speed == 1) { - if (cpi->oxcf.cpu_used <= 5) - speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04); - else - speed_correction = 1.25; - } - - // Look at the drop in prediction quality between the last frame - // and the GF buffer (which contained an older frame). 
- if (fpstats->sr_coded_error > fpstats->coded_error) { - sr_err_diff = - (fpstats->sr_coded_error - fpstats->coded_error) / - (fpstats->count * cpi->common.MBs); - sr_correction = (sr_err_diff / 32.0); - sr_correction = pow(sr_correction, 0.25); - if (sr_correction < 0.75) - sr_correction = 0.75; - else if (sr_correction > 1.25) - sr_correction = 1.25; - } else { - sr_correction = 0.75; - } - - // II ratio correction factor for clip as a whole - clip_iiratio = cpi->twopass.total_stats.intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error); - clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025); - if (clip_iifactor < 0.80) - clip_iifactor = 0.80; - - // Try and pick a Q that can encode the content at the given rate. - for (q = 0; q < MAXQ; q++) { - int bits_per_mb_at_this_q; - - // Error per MB based correction factor - err_correction_factor = - calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) * - sr_correction * speed_correction * clip_iifactor; - - bits_per_mb_at_this_q = - vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor); - - if (bits_per_mb_at_this_q <= target_norm_bits_per_mb) - break; - } - - // Clip value to range "best allowed to (worst allowed - 1)" - q = select_cq_level(q); - if (q >= cpi->worst_quality) - q = cpi->worst_quality - 1; - if (q < cpi->best_quality) - q = cpi->best_quality; - - return q; } extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { - FIRSTPASS_STATS this_frame; - FIRSTPASS_STATS *start_pos; + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int is_two_pass_svc = (svc->number_spatial_layers > 1) || + (svc->number_temporal_layers > 1); + TWO_PASS *const twopass = is_two_pass_svc ? + &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; + double frame_rate; + FIRSTPASS_STATS *stats; - double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; - double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * - cpi->oxcf.two_pass_vbrmin_section / 100); + zero_stats(&twopass->total_stats); + zero_stats(&twopass->total_left_stats); - if (two_pass_min_rate < lower_bounds_min_rate) - two_pass_min_rate = lower_bounds_min_rate; - - zero_stats(&cpi->twopass.total_stats); - zero_stats(&cpi->twopass.total_left_stats); - - if (!cpi->twopass.stats_in_end) + if (!twopass->stats_in_end) return; - cpi->twopass.total_stats = *cpi->twopass.stats_in_end; - cpi->twopass.total_left_stats = cpi->twopass.total_stats; + stats = &twopass->total_stats; - // each frame can have a different duration, as the frame rate in the source - // isn't guaranteed to be constant. The frame rate prior to the first frame - // encoded in the second pass is a guess. However the sum duration is not. - // Its calculated based on the actual durations of all frames from the first - // pass. - vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + *stats = *twopass->stats_in_end; + twopass->total_left_stats = *stats; - cpi->output_framerate = cpi->oxcf.framerate; - cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * - cpi->oxcf.target_bandwidth / 10000000.0); - cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * - two_pass_min_rate / 10000000.0); + frame_rate = 10000000.0 * stats->count / stats->duration; + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. 
The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. + // It is calculated based on the actual durations of all frames from the + // first pass. + + if (is_two_pass_svc) { + vp9_update_spatial_layer_framerate(cpi, frame_rate); + twopass->bits_left = (int64_t)(stats->duration * + svc->layer_context[svc->spatial_layer_id].target_bandwidth / + 10000000.0); + } else { + vp9_new_framerate(cpi, frame_rate); + twopass->bits_left = (int64_t)(stats->duration * oxcf->target_bandwidth / + 10000000.0); + } // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure // that clips that are static but "low complexity" in the intra domain - // are still boosted appropriately for KF/GF/ARF - cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; - cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; - - // This variable monitors how far behind the second ref update is lagging - cpi->twopass.sr_update_lag = 1; - - // Scan the first pass file and calculate an average Intra / Inter error score - // ratio for the sequence. - { - double sum_iiratio = 0.0; - double IIRatio; - - start_pos = cpi->twopass.stats_in; // Note the starting "file" position. - - while (input_stats(cpi, &this_frame) != EOF) { - IIRatio = this_frame.intra_error - / DOUBLE_DIVIDE_CHECK(this_frame.coded_error); - IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio; - sum_iiratio += IIRatio; - } - - cpi->twopass.avg_iiratio = sum_iiratio / - DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count); - - // Reset file position - reset_fpf_position(cpi, start_pos); + // are still boosted appropriately for KF/GF/ARF. + if (!is_two_pass_svc) { + // We don't know the number of MBs for each layer at this point. + // So we will do it later. + twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; + twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; } + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + // Scan the first pass file and calculate a modified total error based upon // the bias/power function used to allocate bits. { - start_pos = cpi->twopass.stats_in; // Note starting "file" position - - cpi->twopass.modified_error_total = 0.0; - cpi->twopass.modified_error_used = 0.0; - - while (input_stats(cpi, &this_frame) != EOF) { - cpi->twopass.modified_error_total += - calculate_modified_err(cpi, &this_frame); + const double avg_error = stats->coded_error / + DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = (avg_error * + oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = (avg_error * + oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_in_end) { + modified_error_total += calculate_modified_err(twopass, oxcf, s); + ++s; } - cpi->twopass.modified_error_left = cpi->twopass.modified_error_total; - - reset_fpf_position(cpi, start_pos); // Reset file position + twopass->modified_error_left = modified_error_total; } + + // Reset the vbr bits off target counter + cpi->rc.vbr_bits_off_target = 0; } -void vp9_end_second_pass(VP9_COMP *cpi) { -} - -// This function gives and estimate of how badly we believe -// the prediction quality is decaying from frame to frame. 
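/* The rewritten decay estimate below collapses to two terms; a sketch of
 * the same arithmetic for one frame's stats (types simplified, fclamp
 * and MIN expanded; pow() from <math.h>, already included): */
static double decay_rate_sketch(double sr_coded_error, double coded_error,
                                double pcnt_inter, int mbs) {
  const double diff = (sr_coded_error - coded_error) / mbs;
  double second_ref_decay = 0.85;
  if (diff <= 512.0) {
    second_ref_decay = pow(1.0 - diff / 512.0, 0.5);
    if (second_ref_decay < 0.85) second_ref_decay = 0.85;
    if (second_ref_decay > 1.0) second_ref_decay = 1.0;
  }
  return second_ref_decay < pcnt_inter ? second_ref_decay : pcnt_inter;
}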
-static double get_prediction_decay_rate(VP9_COMP *cpi, - FIRSTPASS_STATS *next_frame) { - double prediction_decay_rate; - double second_ref_decay; - double mb_sr_err_diff; - - // Initial basis is the % mbs inter coded - prediction_decay_rate = next_frame->pcnt_inter; - +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. +static double get_prediction_decay_rate(const VP9_COMMON *cm, + const FIRSTPASS_STATS *next_frame) { // Look at the observed drop in prediction quality between the last frame // and the GF buffer (which contains an older frame). - mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) / - cpi->common.MBs; - if (mb_sr_err_diff <= 512.0) { - second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0); - second_ref_decay = pow(second_ref_decay, 0.5); - if (second_ref_decay < 0.85) - second_ref_decay = 0.85; - else if (second_ref_decay > 1.0) - second_ref_decay = 1.0; - } else { - second_ref_decay = 0.85; - } + const double mb_sr_err_diff = (next_frame->sr_coded_error - + next_frame->coded_error) / cm->MBs; + const double second_ref_decay = mb_sr_err_diff <= 512.0 + ? fclamp(pow(1.0 - (mb_sr_err_diff / 512.0), 0.5), 0.85, 1.0) + : 0.85; - if (second_ref_decay < prediction_decay_rate) - prediction_decay_rate = second_ref_decay; - - return prediction_decay_rate; + return MIN(second_ref_decay, next_frame->pcnt_inter); } +// This function gives an estimate of the fraction of the frame that has +// zero motion, capped by the coded error to second reference error ratio. +static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) { + const double sr_ratio = frame->coded_error / + DOUBLE_DIVIDE_CHECK(frame->sr_coded_error); + const double zero_motion_pct = frame->pcnt_inter - + frame->pcnt_motion; + + return MIN(sr_ratio, zero_motion_pct); +} + + // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. -static int detect_transition_to_still( - VP9_COMP *cpi, - int frame_interval, - int still_interval, - double loop_decay_rate, - double last_decay_rate) { - int trans_to_still = 0; - +static int detect_transition_to_still(const TWO_PASS *twopass, + int frame_interval, int still_interval, + double loop_decay_rate, + double last_decay_rate) { // Break clause to detect very still sections after motion // For example a static image after a fade or other transition // instead of a clean scene cut. @@ -1296,135 +1094,98 @@ static int detect_transition_to_still( loop_decay_rate >= 0.999 && last_decay_rate < 0.9) { int j; - FIRSTPASS_STATS *position = cpi->twopass.stats_in; - FIRSTPASS_STATS tmp_next_frame; - double zz_inter; - // Look ahead a few frames to see if static condition - // persists... - for (j = 0; j < still_interval; j++) { - if (EOF == input_stats(cpi, &tmp_next_frame)) + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; + if (stats >= twopass->stats_in_end) break; - zz_inter = - (tmp_next_frame.pcnt_inter - tmp_next_frame.pcnt_motion); - if (zz_inter < 0.999) + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; } - // Reset file position - reset_fpf_position(cpi, position); - // Only if it does do we signal a transition to still - if (j == still_interval) - trans_to_still = 1; + // Only if it does do we signal a transition to still.
+ return j == still_interval; } - return trans_to_still; + return 0; } // This function detects a flash through the high relative pcnt_second_ref // score in the frame following a flash frame. The offset passed in should -// reflect this -static int detect_flash(VP9_COMP *cpi, int offset) { - FIRSTPASS_STATS next_frame; +// reflect this. +static int detect_flash(const TWO_PASS *twopass, int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); - int flash_detected = 0; - - // Read the frame data. - // The return is FALSE (no flash detected) if not a valid frame - if (read_frame_stats(cpi, &next_frame, offset) != EOF) { - // What we are looking for here is a situation where there is a - // brief break in prediction (such as a flash) but subsequent frames - // are reasonably well predicted by an earlier (pre flash) frame. - // The recovery after a flash is indicated by a high pcnt_second_ref - // comapred to pcnt_inter. - if (next_frame.pcnt_second_ref > next_frame.pcnt_inter && - next_frame.pcnt_second_ref >= 0.5) - flash_detected = 1; - } - - return flash_detected; + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; } -// Update the motion related elements to the GF arf boost calculation -static void accumulate_frame_motion_stats( - FIRSTPASS_STATS *this_frame, - double *this_frame_mv_in_out, - double *mv_in_out_accumulator, - double *abs_mv_in_out_accumulator, - double *mv_ratio_accumulator) { - // double this_frame_mv_in_out; - double this_frame_mvr_ratio; - double this_frame_mvc_ratio; - double motion_pct; +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + double *mv_in_out, + double *mv_in_out_accumulator, + double *abs_mv_in_out_accumulator, + double *mv_ratio_accumulator) { + const double pct = stats->pcnt_motion; - // Accumulate motion stats. - motion_pct = this_frame->pcnt_motion; + // Accumulate Motion In/Out of frame stats. + *mv_in_out = stats->mv_in_out_count * pct; + *mv_in_out_accumulator += *mv_in_out; + *abs_mv_in_out_accumulator += fabs(*mv_in_out); - // Accumulate Motion In/Out of frame stats - *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct; - *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct; - *abs_mv_in_out_accumulator += - fabs(this_frame->mv_in_out_count * motion_pct); + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = fabs(stats->mvr_abs) / + DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = fabs(stats->mvc_abs) / + DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); - // Accumulate a measure of how uniform (or conversely how random) - // the motion field is. (A ratio of absmv / mv) - if (motion_pct > 0.05) { - this_frame_mvr_ratio = fabs(this_frame->mvr_abs) / - DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVr)); - - this_frame_mvc_ratio = fabs(this_frame->mvc_abs) / - DOUBLE_DIVIDE_CHECK(fabs(this_frame->MVc)); - - *mv_ratio_accumulator += - (this_frame_mvr_ratio < this_frame->mvr_abs) - ? 
(this_frame_mvr_ratio * motion_pct) - : this_frame->mvr_abs * motion_pct; - - *mv_ratio_accumulator += - (this_frame_mvc_ratio < this_frame->mvc_abs) - ? (this_frame_mvc_ratio * motion_pct) - : this_frame->mvc_abs * motion_pct; + *mv_ratio_accumulator += pct * (mvr_ratio < stats->mvr_abs ? + mvr_ratio : stats->mvr_abs); + *mv_ratio_accumulator += pct * (mvc_ratio < stats->mvc_abs ? + mvc_ratio : stats->mvc_abs); } } // Calculate a baseline boost number for the current frame. -static double calc_frame_boost( - VP9_COMP *cpi, - FIRSTPASS_STATS *this_frame, - double this_frame_mv_in_out) { +static double calc_frame_boost(const TWO_PASS *twopass, + const FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out) { double frame_boost; - // Underlying boost factor is based on inter intra error ratio - if (this_frame->intra_error > cpi->twopass.gf_intra_err_min) + // Underlying boost factor is based on inter intra error ratio. + if (this_frame->intra_error > twopass->gf_intra_err_min) frame_boost = (IIFACTOR * this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); else - frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min / + frame_boost = (IIFACTOR * twopass->gf_intra_err_min / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)); - // Increase boost for frames where new data coming into frame - // (eg zoom out). Slightly reduce boost if there is a net balance - // of motion out of the frame (zoom in). - // The range for this_frame_mv_in_out is -1.0 to +1.0 + // Increase boost for frames where new data is coming into the frame + // (e.g. zoom out). Slightly reduce boost if there is a net balance of motion + // out of the frame (zoom in). The range for this_frame_mv_in_out is + // -1.0 to +1.0. if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In extreme case boost is halved + // In the extreme case the boost is halved. else frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); - // Clip to maximum - if (frame_boost > GF_RMAX) - frame_boost = GF_RMAX; - - return frame_boost; + return MIN(frame_boost, GF_RMAX); } static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, int *f_boost, int *b_boost) { - FIRSTPASS_STATS this_frame; - + TWO_PASS *const twopass = &cpi->twopass; int i; double boost_score = 0.0; double mv_ratio_accumulator = 0.0; @@ -1435,36 +1196,37 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int arf_boost; int flash_detected = 0; - // Search forward from the proposed arf/next gf position - for (i = 0; i < f_frames; i++) { - if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF) + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; - // Update the motion related elements to the boost calculation - accumulate_frame_motion_stats(&this_frame, + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats(this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores.
- flash_detected = detect_flash(cpi, (i + offset)) || - detect_flash(cpi, (i + offset + 1)); + flash_detected = detect_flash(twopass, i + offset) || + detect_flash(twopass, i + offset + 1); - // Cumulative effect of prediction quality decay + // Accumulate the effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame); + decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame, + this_frame_mv_in_out); } *f_boost = (int)boost_score; - // Reset for backward looking loop + // Reset for backward looking loop. boost_score = 0.0; mv_ratio_accumulator = 0.0; decay_accumulator = 1.0; @@ -1472,31 +1234,32 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; - // Search backward towards last gf position - for (i = -1; i >= -b_frames; i--) { - if (read_frame_stats(cpi, &this_frame, (i + offset)) == EOF) + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; - // Update the motion related elements to the boost calculation - accumulate_frame_motion_stats(&this_frame, + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats(this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(cpi, (i + offset)) || - detect_flash(cpi, (i + offset + 1)); + flash_detected = detect_flash(twopass, i + offset) || + detect_flash(twopass, i + offset + 1); - // Cumulative effect of prediction quality decay + // Cumulative effect of prediction quality decay. if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame); + decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame); decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR ? MIN_DECAY_FACTOR : decay_accumulator; } - boost_score += (decay_accumulator * - calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out)); + boost_score += decay_accumulator * calc_frame_boost(twopass, this_frame, + this_frame_mv_in_out); } *b_boost = (int)boost_score; @@ -1507,148 +1270,256 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, return arf_boost; } -#if CONFIG_MULTIPLE_ARF -// Work out the frame coding order for a GF or an ARF group. -// The current implementation codes frames in their natural order for a -// GF group, and inserts additional ARFs into an ARF group using a -// binary split approach. -// NOTE: this function is currently implemented recursively. -static void schedule_frames(VP9_COMP *cpi, const int start, const int end, - const int arf_idx, const int gf_or_arf_group, - const int level) { - int i, abs_end, half_range; - int *cfo = cpi->frame_coding_order; - int idx = cpi->new_frame_coding_order_period; +// Calculate a section intra ratio used in setting max loop filter.
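+// The value returned is the total intra error over the total coded error +// for the section, so larger values indicate more intra-like content.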
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; - // If (end < 0) an ARF should be coded at position (-end). - assert(start >= 0); - - // printf("start:%d end:%d\n", start, end); - - // GF Group: code frames in logical order. - if (gf_or_arf_group == 0) { - assert(end >= start); - for (i = start; i <= end; ++i) { - cfo[idx] = i; - cpi->arf_buffer_idx[idx] = arf_idx; - cpi->arf_weight[idx] = -1; - ++idx; - } - cpi->new_frame_coding_order_period = idx; - return; + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; } - // ARF Group: work out the ARF schedule. - // Mark ARF frames as negative. - if (end < 0) { - // printf("start:%d end:%d\n", -end, -end); - // ARF frame is at the end of the range. - cfo[idx] = end; - // What ARF buffer does this ARF use as predictor. - cpi->arf_buffer_idx[idx] = (arf_idx > 2) ? (arf_idx - 1) : 2; - cpi->arf_weight[idx] = level; - ++idx; - abs_end = -end; - } else { - abs_end = end; - } - - half_range = (abs_end - start) >> 1; - - // ARFs may not be adjacent, they must be separated by at least - // MIN_GF_INTERVAL non-ARF frames. - if ((start + MIN_GF_INTERVAL) >= (abs_end - MIN_GF_INTERVAL)) { - // printf("start:%d end:%d\n", start, abs_end); - // Update the coding order and active ARF. - for (i = start; i <= abs_end; ++i) { - cfo[idx] = i; - cpi->arf_buffer_idx[idx] = arf_idx; - cpi->arf_weight[idx] = -1; - ++idx; - } - cpi->new_frame_coding_order_period = idx; - } else { - // Place a new ARF at the mid-point of the range. - cpi->new_frame_coding_order_period = idx; - schedule_frames(cpi, start, -(start + half_range), arf_idx + 1, - gf_or_arf_group, level + 1); - schedule_frames(cpi, start + half_range + 1, abs_end, arf_idx, - gf_or_arf_group, level + 1); - } + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); } -#define FIXED_ARF_GROUP_SIZE 16 +// Calculate the total bits to allocate in this GF/ARF group. +static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; -void define_fixed_arf_period(VP9_COMP *cpi) { + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) ? + 0 : (total_group_bits > twopass->kf_group_bits) ? + twopass->kf_group_bits : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number of extra bits to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, + int boost, int64_t total_group_bits) { + int allocation_chunks; + + // Return 0 for invalid inputs (could arise e.g.
through rounding errors) + if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) + return 0; + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return MAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), 0); +} + +// Current limit on maximum number of active arfs in a GF/ARF group. +#define MAX_ACTIVE_ARFS 2 +#define ARF_SLOT1 2 +#define ARF_SLOT2 3 +// This function provides a level of indirection for the choice of the +// buffers used for arfs. +// At the moment the values are fixed but this may change as part of +// the integration process with other codec features that swap buffers around. +static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { + arf_buffer_indices[0] = ARF_SLOT1; + arf_buffer_indices[1] = ARF_SLOT2; +} + +static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, + double group_error, int gf_arf_bits) { + RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + FIRSTPASS_STATS frame_stats; int i; - int max_level = INT_MIN; + int frame_index = 1; + int target_frame_size; + int key_frame; + const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + int64_t total_group_bits = gf_group_bits; + double modified_err = 0.0; + double err_fraction; + int mid_boost_bits = 0; + int mid_frame_idx; + unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; + int alt_frame_index = frame_index; + int has_temporal_layers = is_two_pass_svc(cpi) && + cpi->svc.number_temporal_layers > 1; - assert(cpi->multi_arf_enabled); - assert(cpi->oxcf.lag_in_frames >= FIXED_ARF_GROUP_SIZE); + // Only encode the alt reference frame in the temporal base layer. + if (has_temporal_layers) + alt_frame_index = cpi->svc.number_temporal_layers; - // Save the weight of the last frame in the sequence before next - // sequence pattern overwrites it. - cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number]; - assert(cpi->this_frame_weight >= 0); + key_frame = cpi->common.frame_type == KEY_FRAME || + vp9_is_upper_layer_key_frame(cpi); - // Initialize frame coding order variables. - cpi->new_frame_coding_order_period = 0; - cpi->next_frame_in_order = 0; - cpi->arf_buffered = 0; - vp9_zero(cpi->frame_coding_order); - vp9_zero(cpi->arf_buffer_idx); - vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight)); + get_arf_buffer_indices(arf_buffer_indices); - if (cpi->twopass.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) { - // Setup a GF group close to the keyframe. - cpi->source_alt_ref_pending = 0; - cpi->baseline_gf_interval = cpi->twopass.frames_to_key; - schedule_frames(cpi, 0, (cpi->baseline_gf_interval - 1), 2, 0, 0); + // For key frames the frame target rate is already set and it + // is also the golden frame.
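+ // Slot 0 therefore only needs explicit configuration for non key frames, + // either as the overlay for a previous arf or as a normal golden frame.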
+ if (!key_frame) { + if (rc->source_alt_ref_active) { + gf_group->update_type[0] = OVERLAY_UPDATE; + gf_group->rf_level[0] = INTER_NORMAL; + gf_group->bit_allocation[0] = 0; + gf_group->arf_update_idx[0] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; + } else { + gf_group->update_type[0] = GF_UPDATE; + gf_group->rf_level[0] = GF_ARF_STD; + gf_group->bit_allocation[0] = gf_arf_bits; + gf_group->arf_update_idx[0] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; + } + + // Step over the golden frame / overlay frame + if (EOF == input_stats(twopass, &frame_stats)) + return; + } + + // Deduct the boost bits for arf (or gf if it is not a key frame) + // from the group total. + if (rc->source_alt_ref_pending || !key_frame) + total_group_bits -= gf_arf_bits; + + // Store the bits to spend on the ARF if there is one. + if (rc->source_alt_ref_pending) { + gf_group->update_type[alt_frame_index] = ARF_UPDATE; + gf_group->rf_level[alt_frame_index] = GF_ARF_STD; + gf_group->bit_allocation[alt_frame_index] = gf_arf_bits; + + if (has_temporal_layers) + gf_group->arf_src_offset[alt_frame_index] = + (unsigned char)(rc->baseline_gf_interval - + cpi->svc.number_temporal_layers); + else + gf_group->arf_src_offset[alt_frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + + gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[alt_frame_index] = + arf_buffer_indices[cpi->multi_arf_last_grp_enabled && + rc->source_alt_ref_active]; + if (!has_temporal_layers) + ++frame_index; + + if (cpi->multi_arf_enabled) { + // Set aside a slot for a level 1 arf. + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_LOW; + gf_group->arf_src_offset[frame_index] = + (unsigned char)((rc->baseline_gf_interval >> 1) - 1); + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; + ++frame_index; + } + } + + // Define middle frame + mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; + + // Allocate bits to the other frames in the group. + for (i = 0; i < rc->baseline_gf_interval - 1; ++i) { + int arf_idx = 0; + if (EOF == input_stats(twopass, &frame_stats)) + break; + + if (has_temporal_layers && frame_index == alt_frame_index) { + ++frame_index; + } + + modified_err = calculate_modified_err(twopass, oxcf, &frame_stats); + + if (group_error > 0) + err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error); + else + err_fraction = 0.0; + + target_frame_size = (int)((double)total_group_bits * err_fraction); + + if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { + mid_boost_bits += (target_frame_size >> 4); + target_frame_size -= (target_frame_size >> 4); + + if (frame_index <= mid_frame_idx) + arf_idx = 1; + } + gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; + + target_frame_size = clamp(target_frame_size, 0, + MIN(max_bits, (int)total_group_bits)); + + gf_group->update_type[frame_index] = LF_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + + gf_group->bit_allocation[frame_index] = target_frame_size; + ++frame_index; + } + + // Note: + // We need to configure the frame at the end of the sequence + 1 that will be + // the start frame for the next group. Otherwise prior to the call to + // vp9_rc_get_second_pass_params() the data will be undefined. 
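+ // Default the extra slot to reference the primary arf buffer; its update + // type is set below depending on whether an arf overlay is pending.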
+ gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; + gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; + + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + + // Final setup for second arf and its overlay. + if (cpi->multi_arf_enabled) { + gf_group->bit_allocation[2] = + gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; + gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; + gf_group->bit_allocation[mid_frame_idx] = 0; + } } else { - // Setup a fixed period ARF group. - cpi->source_alt_ref_pending = 1; - cpi->baseline_gf_interval = FIXED_ARF_GROUP_SIZE; - schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0); + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; } - // Replace level indicator of -1 with correct level. - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] > max_level) { - max_level = cpi->arf_weight[i]; - } - } - ++max_level; - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] == -1) { - cpi->arf_weight[i] = max_level; - } - } - cpi->max_arf_level = max_level; -#if 0 - printf("\nSchedule: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->frame_coding_order[i]); - } - printf("\n"); - printf("ARFref: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_buffer_idx[i]); - } - printf("\n"); - printf("Weight: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_weight[i]); - } - printf("\n"); -#endif + // Note whether multi-arf was enabled this group for next time. + cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } -#endif // Analyse and define a gf/arf group. static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - FIRSTPASS_STATS next_frame = { 0 }; - FIRSTPASS_STATS *start_pos; + RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; + double boost_score = 0.0; double old_boost_score = 0.0; double gf_group_err = 0.0; @@ -1659,116 +1530,114 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; // Starting decay rate + double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - int max_bits = frame_max_bits(cpi); // Max for a single frame - - unsigned int allow_alt_ref = - cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames; + unsigned int allow_alt_ref = is_altref_enabled(cpi); int f_boost = 0; int b_boost = 0; int flash_detected; int active_max_gf_interval; + int64_t gf_group_bits; + double gf_group_error_left; + int gf_arf_bits; - cpi->twopass.gf_group_bits = 0; + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (cpi->common.frame_type != KEY_FRAME) { + vp9_zero(twopass->gf_group); + } - vp9_clear_system_state(); // __asm emms; - - start_pos = cpi->twopass.stats_in; + vp9_clear_system_state(); + vp9_zero(next_frame); // Load stats for the current frame. 
- mod_frame_err = calculate_modified_err(cpi, this_frame); + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); - // Note the error of the frame at the start of the group (this will be - // the GF frame error if we code a normal gf + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. gf_first_frame_err = mod_frame_err; - // Special treatment if the current frame is a key frame (which is also - // a gf). If it is then its error score (and hence bit allocation) need - // to be subtracted out from the calculation for the GF group - if (cpi->common.frame_type == KEY_FRAME) + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) gf_group_err -= gf_first_frame_err; // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 10.0; - // Work out a maximum interval for the GF. - // If the image appears completely static we can extend beyond this. - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = - 12 + ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 5); + // Work out a maximum interval for the GF group. + // If the image appears almost completely static we can extend beyond this. + if (cpi->multi_arf_allowed) { + active_max_gf_interval = rc->max_gf_interval; + } else { + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = + 12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5); - if (active_max_gf_interval > cpi->max_gf_interval) - active_max_gf_interval = cpi->max_gf_interval; + if (active_max_gf_interval > rc->max_gf_interval) + active_max_gf_interval = rc->max_gf_interval; + } i = 0; - while (((i < cpi->twopass.static_scene_max_gf_interval) || - ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) && - (i < cpi->twopass.frames_to_key)) { - i++; // Increment the loop counter + while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { + ++i; - // Accumulate error score of frames in this gf group - mod_frame_err = calculate_modified_err(cpi, this_frame); + // Accumulate error score of frames in this gf group. + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); gf_group_err += mod_frame_err; - if (EOF == input_stats(cpi, &next_frame)) + if (EOF == input_stats(twopass, &next_frame)) break; // Test for the case where there is a brief flash but the prediction // quality back to an earlier frame is then restored. - flash_detected = detect_flash(cpi, 0); + flash_detected = detect_flash(twopass, 0); - // Update the motion related elements to the boost calculation + // Update the motion related elements to the boost calculation. 
accumulate_frame_motion_stats(&next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // Cumulative effect of prediction quality decay + // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame); decay_accumulator = decay_accumulator * loop_decay_rate; // Monitor for static sections. - if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = - (next_frame.pcnt_inter - next_frame.pcnt_motion); - } + zero_motion_accumulator = MIN(zero_motion_accumulator, + get_zero_motion_factor(&next_frame)); - // Break clause to detect very still sections after motion - // (for example a static image after a fade or other transition). - if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. + if (detect_transition_to_still(twopass, i, 5, loop_decay_rate, last_loop_decay_rate)) { allow_alt_ref = 0; break; } } - // Calculate a boost number for this frame - boost_score += - (decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out)); + // Calculate a boost number for this frame. + boost_score += decay_accumulator * calc_frame_boost(twopass, &next_frame, + this_frame_mv_in_out); // Break out conditions. if ( - // Break at cpi->max_gf_interval unless almost totally static + // Break at active_max_gf_interval unless almost totally static. (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) || ( - // Don't break out with a very short interval + // Don't break out with a very short interval. (i > MIN_GF_INTERVAL) && - // Don't break out very close to a key frame - ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || @@ -1784,592 +1653,176 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { old_boost_score = boost_score; } - cpi->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); + twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); - // Don't allow a gf too near the next kf - if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { - while (i < cpi->twopass.frames_to_key) { - i++; + // Don't allow a gf too near the next kf. + if ((rc->frames_to_key - i) < MIN_GF_INTERVAL) { + while (i < (rc->frames_to_key + !rc->next_key_frame_forced)) { + ++i; - if (EOF == input_stats(cpi, this_frame)) + if (EOF == input_stats(twopass, this_frame)) break; - if (i < cpi->twopass.frames_to_key) { - mod_frame_err = calculate_modified_err(cpi, this_frame); + if (i < rc->frames_to_key) { + mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame); gf_group_err += mod_frame_err; } } } - // Set the interval until the next gf or arf. - cpi->baseline_gf_interval = i; + // Set the interval until the next gf. + if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) + rc->baseline_gf_interval = i - 1; + else + rc->baseline_gf_interval = i; -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - // Initialize frame coding order variables. 
- cpi->new_frame_coding_order_period = 0; - cpi->next_frame_in_order = 0; - cpi->arf_buffered = 0; - vp9_zero(cpi->frame_coding_order); - vp9_zero(cpi->arf_buffer_idx); - vpx_memset(cpi->arf_weight, -1, sizeof(cpi->arf_weight)); + // Only encode the alt reference frame in the temporal base layer, so + // baseline_gf_interval should be a multiple of a temporal layer group + // (typically the frame distance between two base layer frames). + if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { + int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; + int new_gf_interval = (rc->baseline_gf_interval + count) & (~count); + int j; + for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { + if (EOF == input_stats(twopass, this_frame)) + break; + gf_group_err += calculate_modified_err(twopass, oxcf, this_frame); + } + rc->baseline_gf_interval = new_gf_interval; } -#endif - // Should we use the alternate reference frame + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + + // Should we use the alternate reference frame? if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= MIN_GF_INTERVAL) && - // dont use ARF very near next kf - (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && - ((next_frame.pcnt_inter > 0.75) || - (next_frame.pcnt_second_ref > 0.5)) && - ((mv_in_out_accumulator / (double)i > -0.2) || - (mv_in_out_accumulator > -2.0)) && - (boost_score > 100)) { - // Alternative boost calculation for alt ref - cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, - &b_boost); - cpi->source_alt_ref_pending = 1; + // For real scene cuts (not forced kfs) don't allow arf very near kf. + (rc->next_key_frame_forced || + (i <= (rc->frames_to_key - MIN_GF_INTERVAL)))) { + // Calculate the boost for alt ref. + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, + &b_boost); + rc->source_alt_ref_pending = 1; -#if CONFIG_MULTIPLE_ARF - // Set the ARF schedule. - if (cpi->multi_arf_enabled) { - schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0); - } -#endif + // Test to see if multi arf is appropriate. + cpi->multi_arf_enabled = + (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && + (zero_motion_accumulator < 0.995)) ? 1 : 0; } else { - cpi->gfu_boost = (int)boost_score; - cpi->source_alt_ref_pending = 0; -#if CONFIG_MULTIPLE_ARF - // Set the GF schedule. - if (cpi->multi_arf_enabled) { - schedule_frames(cpi, 0, cpi->baseline_gf_interval - 1, 2, 0, 0); - assert(cpi->new_frame_coding_order_period == cpi->baseline_gf_interval); - } -#endif + rc->gfu_boost = (int)boost_score; + rc->source_alt_ref_pending = 0; } -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled && (cpi->common.frame_type != KEY_FRAME)) { - int max_level = INT_MIN; - // Replace level indicator of -1 with correct level.
- for (i = 0; i < cpi->frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] > max_level) { - max_level = cpi->arf_weight[i]; - } - } - ++max_level; - for (i = 0; i < cpi->frame_coding_order_period; ++i) { - if (cpi->arf_weight[i] == -1) { - cpi->arf_weight[i] = max_level; - } - } - cpi->max_arf_level = max_level; - } -#if 0 - if (cpi->multi_arf_enabled) { - printf("\nSchedule: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->frame_coding_order[i]); - } - printf("\n"); - printf("ARFref: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_buffer_idx[i]); - } - printf("\n"); - printf("Weight: "); - for (i = 0; i < cpi->new_frame_coding_order_period; ++i) { - printf("%4d ", cpi->arf_weight[i]); - } - printf("\n"); - } -#endif -#endif + // Reset the file position. + reset_fpf_position(twopass, start_pos); - // Now decide how many bits should be allocated to the GF group as a - // proportion of those remaining in the kf group. - // The final key frame group in the clip is treated as a special case - // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left. - // This is also important for short clips where there may only be one - // key frame. - if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count - - cpi->common.current_video_frame)) { - cpi->twopass.kf_group_bits = - (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0; - } - - // Calculate the bits to be allocated to the group as a whole - if ((cpi->twopass.kf_group_bits > 0) && - (cpi->twopass.kf_group_error_left > 0)) { - cpi->twopass.gf_group_bits = - (int64_t)(cpi->twopass.kf_group_bits * - (gf_group_err / cpi->twopass.kf_group_error_left)); - } else { - cpi->twopass.gf_group_bits = 0; - } - cpi->twopass.gf_group_bits = - (cpi->twopass.gf_group_bits < 0) - ? 0 - : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits; - - // Clip cpi->twopass.gf_group_bits based on user supplied data rate - // variability limit (cpi->oxcf.two_pass_vbrmax_section) - if (cpi->twopass.gf_group_bits > - (int64_t)max_bits * cpi->baseline_gf_interval) - cpi->twopass.gf_group_bits = (int64_t)max_bits * cpi->baseline_gf_interval; - - // Reset the file position - reset_fpf_position(cpi, start_pos); - - // Update the record of error used so far (only done once per gf group) - cpi->twopass.modified_error_used += gf_group_err; - - // Assign bits to the arf or gf. - for (i = 0; - i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); - ++i) { - int allocation_chunks; - int q = cpi->oxcf.fixed_q < 0 ? 
cpi->last_q[INTER_FRAME] - : cpi->oxcf.fixed_q; - int gf_bits; - - int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100; - - // Set max and minimum boost and hence minimum allocation - boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200); - - if (cpi->source_alt_ref_pending && i == 0) - allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost; - else - allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100); - - // Prevent overflow - if (boost > 1023) { - int divisor = boost >> 10; - boost /= divisor; - allocation_chunks /= divisor; - } - - // Calculate the number of bits to be spent on the gf or arf based on - // the boost number - gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits / - (double)allocation_chunks)); - - // If the frame that is to be boosted is simpler than the average for - // the gf/arf group then use an alternative calculation - // based on the error score of the frame itself - if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) { - double alt_gf_grp_bits = - (double)cpi->twopass.kf_group_bits * - (mod_frame_err * (double)cpi->baseline_gf_interval) / - DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left); - - int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits / - (double)allocation_chunks)); - - if (gf_bits > alt_gf_bits) - gf_bits = alt_gf_bits; - } else { - // If it is harder than other frames in the group make sure it at - // least receives an allocation in keeping with its relative error - // score, otherwise it may be worse off than an "un-boosted" frame. - int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits * - mod_frame_err / - DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left)); - - if (alt_gf_bits > gf_bits) - gf_bits = alt_gf_bits; - } - - // Dont allow a negative value for gf_bits - if (gf_bits < 0) - gf_bits = 0; - - // Add in minimum for a frame - gf_bits += cpi->min_frame_bandwidth; - - if (i == 0) { - cpi->twopass.gf_bits = gf_bits; - } - if (i == 1 || (!cpi->source_alt_ref_pending - && (cpi->common.frame_type != KEY_FRAME))) { - // Per frame bit target for this frame - cpi->per_frame_bandwidth = gf_bits; - } - } + // Calculate the bits to be allocated to the gf/arf group as a whole + gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + // Calculate the extra bits to be used for boosted frame(s) { - // Adjust KF group bits and error remaining - cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err; - cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits; + int q = rc->last_q[INTER_FRAME]; + int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100; - if (cpi->twopass.kf_group_bits < 0) - cpi->twopass.kf_group_bits = 0; + // Set max and minimum boost and hence minimum allocation. + boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200); - // Note the error score left in the remaining frames of the group. 
- // For normal GFs we want to remove the error score for the first frame - // of the group (except in Key frame case where this has already - // happened) - if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME) - cpi->twopass.gf_group_error_left = (int64_t)(gf_group_err - - gf_first_frame_err); - else - cpi->twopass.gf_group_error_left = (int64_t)gf_group_err; - - cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - - cpi->min_frame_bandwidth; - - if (cpi->twopass.gf_group_bits < 0) - cpi->twopass.gf_group_bits = 0; - - // This condition could fail if there are two kfs very close together - // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the - // calculation of alt_extra_bits. - if (cpi->baseline_gf_interval >= 3) { - const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost; - - if (boost >= 150) { - int alt_extra_bits; - int pct_extra = (boost - 100) / 50; - pct_extra = (pct_extra > 20) ? 20 : pct_extra; - - alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100); - cpi->twopass.gf_group_bits -= alt_extra_bits; - } - } + // Calculate the extra bits to be used for boosted frame(s) + gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, + boost, gf_group_bits); } + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= (int64_t)gf_group_err; + + // If this is an arf update we want to remove the score for the overlay + // frame at the end which will usually be very cheap to code. + // The overlay frame has already, in effect, been coded so we want to spread + // the remaining bits among the other frames. + // For normal GFs remove the score for the GF itself unless this is + // also a key frame in which case it has already been accounted for. + if (rc->source_alt_ref_pending) { + gf_group_error_left = gf_group_err - mod_frame_err; + } else if (cpi->common.frame_type != KEY_FRAME) { + gf_group_error_left = gf_group_err - gf_first_frame_err; + } else { + gf_group_error_left = gf_group_err; + } + + // Allocate bits to each of the frames in the GF group. + allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits); + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + + // Calculate a section intra ratio used in setting max loop filter. if (cpi->common.frame_type != KEY_FRAME) { - FIRSTPASS_STATS sectionstats; - - zero_stats(§ionstats); - reset_fpf_position(cpi, start_pos); - - for (i = 0; i < cpi->baseline_gf_interval; i++) { - input_stats(cpi, &next_frame); - accumulate_stats(§ionstats, &next_frame); - } - - avg_stats(§ionstats); - - cpi->twopass.section_intra_rating = (int) - (sectionstats.intra_error / - DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); - - reset_fpf_position(cpi, start_pos); + twopass->section_intra_rating = + calculate_section_intra_ratio(start_pos, twopass->stats_in_end, + rc->baseline_gf_interval); } } -// Allocate bits to a normal frame that is neither a gf an arf or a key frame. -static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { - int target_frame_size; - - double modified_err; - double err_fraction; - - // Max for a single frame. - int max_bits = frame_max_bits(cpi); - - // Calculate modified prediction error used in bit allocation. - modified_err = calculate_modified_err(cpi, this_frame); - - if (cpi->twopass.gf_group_error_left > 0) - // What portion of the remaining GF group error is used by this frame. 
- err_fraction = modified_err / cpi->twopass.gf_group_error_left; - else - err_fraction = 0.0; - - // How many of those bits available for allocation should we give it? - target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction); - - // Clip target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at - // the top end. - if (target_frame_size < 0) { - target_frame_size = 0; - } else { - if (target_frame_size > max_bits) - target_frame_size = max_bits; - - if (target_frame_size > cpi->twopass.gf_group_bits) - target_frame_size = (int)cpi->twopass.gf_group_bits; - } - - // Adjust error and bits remaining. - cpi->twopass.gf_group_error_left -= (int64_t)modified_err; - cpi->twopass.gf_group_bits -= target_frame_size; - - if (cpi->twopass.gf_group_bits < 0) - cpi->twopass.gf_group_bits = 0; - - // Add in the minimum number of bits that is set aside for every frame. - target_frame_size += cpi->min_frame_bandwidth; - - // Per frame bit target for this frame. - cpi->per_frame_bandwidth = target_frame_size; -} - -// Make a damped adjustment to the active max q. -static int adjust_active_maxq(int old_maxqi, int new_maxqi) { - int i; - const double old_q = vp9_convert_qindex_to_q(old_maxqi); - const double new_q = vp9_convert_qindex_to_q(new_maxqi); - const double target_q = ((old_q * 7.0) + new_q) / 8.0; - - if (target_q > old_q) { - for (i = old_maxqi; i <= new_maxqi; i++) - if (vp9_convert_qindex_to_q(i) >= target_q) - return i; - } else { - for (i = old_maxqi; i >= new_maxqi; i--) - if (vp9_convert_qindex_to_q(i) <= target_q) - return i; - } - - return new_maxqi; -} - -void vp9_second_pass(VP9_COMP *cpi) { - int tmp_q; - int frames_left = (int)(cpi->twopass.total_stats.count - - cpi->common.current_video_frame); - - FIRSTPASS_STATS this_frame; - FIRSTPASS_STATS this_frame_copy; - - double this_frame_intra_error; - double this_frame_coded_error; - - if (!cpi->twopass.stats_in) - return; - - vp9_clear_system_state(); - - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - cpi->active_worst_quality = cpi->oxcf.cq_level; - } else { - // Special case code for first frame. - if (cpi->common.current_video_frame == 0) { - int section_target_bandwidth = - (int)(cpi->twopass.bits_left / frames_left); - cpi->twopass.est_max_qcorrection_factor = 1.0; - - // Set a cq_level in constrained quality mode. - // Commenting this code out for now since it does not seem to be - // working well. - /* - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats, - section_target_bandwidth); - - if (est_cq > cpi->cq_target_quality) - cpi->cq_target_quality = est_cq; - else - cpi->cq_target_quality = cpi->oxcf.cq_level; - } - */ - - // guess at maxq needed in 2nd pass - cpi->twopass.maxq_max_limit = cpi->worst_quality; - cpi->twopass.maxq_min_limit = cpi->best_quality; - - tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats, - section_target_bandwidth); - - cpi->active_worst_quality = tmp_q; - cpi->ni_av_qi = tmp_q; - cpi->avg_q = vp9_convert_qindex_to_q(tmp_q); - - // Limit the maxq value returned subsequently. 
- // This increases the risk of overspend or underspend if the initial - // estimate for the clip is bad, but helps prevent excessive - // variation in Q, especially near the end of a clip - // where for example a small overspend may cause Q to crash - adjust_maxq_qrange(cpi); - } - - // The last few frames of a clip almost always have to few or too many - // bits and for the sake of over exact rate control we dont want to make - // radical adjustments to the allowed quantizer range just to use up a - // few surplus bits or get beneath the target rate. - else if ((cpi->common.current_video_frame < - (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) && - ((cpi->common.current_video_frame + cpi->baseline_gf_interval) < - (unsigned int)cpi->twopass.total_stats.count)) { - int section_target_bandwidth = - (int)(cpi->twopass.bits_left / frames_left); - if (frames_left < 1) - frames_left = 1; - - tmp_q = estimate_max_q( - cpi, - &cpi->twopass.total_left_stats, - section_target_bandwidth); - - // Make a damped adjustment to active max Q - cpi->active_worst_quality = - adjust_active_maxq(cpi->active_worst_quality, tmp_q); - } - } - vp9_zero(this_frame); - if (EOF == input_stats(cpi, &this_frame)) - return; - - this_frame_intra_error = this_frame.intra_error; - this_frame_coded_error = this_frame.coded_error; - - // keyframe and section processing ! - if (cpi->twopass.frames_to_key == 0) { - // Define next KF group and assign bits to it - this_frame_copy = this_frame; - find_next_key_frame(cpi, &this_frame_copy); - } - - // Is this a GF / ARF (Note that a KF is always also a GF) - if (cpi->frames_till_gf_update_due == 0) { - // Define next gf group and assign bits to it - this_frame_copy = this_frame; - - cpi->gf_zeromotion_pct = 0; - -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - define_fixed_arf_period(cpi); - } else { -#endif - define_gf_group(cpi, &this_frame_copy); -#if CONFIG_MULTIPLE_ARF - } -#endif - - if (cpi->gf_zeromotion_pct > 995) { - // As long as max_thresh for encode breakout is small enough, it is ok - // to enable it for no-show frame, i.e. set enable_encode_breakout to 2. - if (!cpi->common.show_frame) - cpi->enable_encode_breakout = 0; - else - cpi->enable_encode_breakout = 2; - } - - // If we are going to code an altref frame at the end of the group - // and the current frame is not a key frame.... - // If the previous group used an arf this frame has already benefited - // from that arf boost and it should not be given extra bits - // If the previous group was NOT coded using arf we may want to apply - // some boost to this GF as well - if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) { - // Assign a standard frames worth of bits from those allocated - // to the GF group - int bak = cpi->per_frame_bandwidth; - this_frame_copy = this_frame; - assign_std_frame_bits(cpi, &this_frame_copy); - cpi->per_frame_bandwidth = bak; - } - } else { - // Otherwise this is an ordinary frame - // Assign bits from those allocated to the GF group - this_frame_copy = this_frame; - assign_std_frame_bits(cpi, &this_frame_copy); - } - - // Keep a globally available copy of this and the next frame's iiratio. 
- cpi->twopass.this_iiratio = (int)(this_frame_intra_error / - DOUBLE_DIVIDE_CHECK(this_frame_coded_error)); - { - FIRSTPASS_STATS next_frame; - if (lookup_next_frame_stats(cpi, &next_frame) != EOF) { - cpi->twopass.next_iiratio = (int)(next_frame.intra_error / - DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); - } - } - - // Set nominal per second bandwidth for this frame - cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth - * cpi->output_framerate); - if (cpi->target_bandwidth < 0) - cpi->target_bandwidth = 0; - - cpi->twopass.frames_to_key--; - - // Update the total stats remaining structure - subtract_stats(&cpi->twopass.total_left_stats, &this_frame); -} - -static int test_candidate_kf(VP9_COMP *cpi, - FIRSTPASS_STATS *last_frame, - FIRSTPASS_STATS *this_frame, - FIRSTPASS_STATS *next_frame) { +static int test_candidate_kf(TWO_PASS *twopass, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *next_frame) { int is_viable_kf = 0; - // Does the frame satisfy the primary criteria of a key frame - // If so, then examine how well it predicts subsequent frames + // Does the frame satisfy the primary criteria of a key frame? + // If so, then examine how well it predicts subsequent frames. if ((this_frame->pcnt_second_ref < 0.10) && (next_frame->pcnt_second_ref < 0.10) && ((this_frame->pcnt_inter < 0.05) || - (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .35) && + (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) && ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - .40) || + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) || (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - .40) || + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) || ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) { int i; - FIRSTPASS_STATS *start_pos; - - FIRSTPASS_STATS local_next_frame; - + const FIRSTPASS_STATS *start_pos = twopass->stats_in; + FIRSTPASS_STATS local_next_frame = *next_frame; double boost_score = 0.0; double old_boost_score = 0.0; double decay_accumulator = 1.0; - double next_iiratio; - local_next_frame = *next_frame; - - // Note the starting file position so we can reset to it - start_pos = cpi->twopass.stats_in; - - // Examine how well the key frame predicts subsequent frames - for (i = 0; i < 16; i++) { - next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / - DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); + // Examine how well the key frame predicts subsequent frames. + for (i = 0; i < 16; ++i) { + double next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); if (next_iiratio > RMAX) next_iiratio = RMAX; - // Cumulative effect of decay in prediction quality + // Cumulative effect of decay in prediction quality. if (local_next_frame.pcnt_inter > 0.85) - decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; + decay_accumulator *= local_next_frame.pcnt_inter; else - decay_accumulator = - decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0); + decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; - // decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter; - - // Keep a running total + // Keep a running total. 
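+ // A high accumulated score indicates that the candidate key frame + // continues to predict the following frames well.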
boost_score += (decay_accumulator * next_iiratio); - // Test various breakout clauses + // Test various breakout clauses. if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < 0.20) && (next_iiratio < 3.0)) || ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200) - ) { + (local_next_frame.intra_error < 200)) { break; } old_boost_score = boost_score; // Get the next frame details - if (EOF == input_stats(cpi, &local_next_frame)) + if (EOF == input_stats(twopass, &local_next_frame)) break; } @@ -2379,7 +1832,7 @@ static int test_candidate_kf(VP9_COMP *cpi, is_viable_kf = 1; } else { // Reset the file position - reset_fpf_position(cpi, start_pos); + reset_fpf_position(twopass, start_pos); is_viable_kf = 0; } @@ -2387,343 +1840,476 @@ static int test_candidate_kf(VP9_COMP *cpi, return is_viable_kf; } + static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; - FIRSTPASS_STATS last_frame; - FIRSTPASS_STATS first_frame; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const FIRSTPASS_STATS first_frame = *this_frame; + const FIRSTPASS_STATS *const start_position = twopass->stats_in; FIRSTPASS_STATS next_frame; - FIRSTPASS_STATS *start_position; - + FIRSTPASS_STATS last_frame; + int kf_bits = 0; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double boost_score = 0; - double loop_decay_rate; - + double boost_score = 0.0; double kf_mod_err = 0.0; double kf_group_err = 0.0; - double kf_group_intra_err = 0.0; - double kf_group_coded_err = 0.0; double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; vp9_zero(next_frame); - vp9_clear_system_state(); // __asm emms; - start_position = cpi->twopass.stats_in; - cpi->common.frame_type = KEY_FRAME; - // is this a forced key frame by interval - cpi->this_key_frame_forced = cpi->next_key_frame_forced; + // Reset the GF group data structures. + vp9_zero(*gf_group); - // Clear the alt ref active flag as this can never be active on a key frame - cpi->source_alt_ref_active = 0; + // Is this a forced key frame by interval. + rc->this_key_frame_forced = rc->next_key_frame_forced; - // Kf is always a gf so clear frames till next gf counter - cpi->frames_till_gf_update_due = 0; + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. + rc->source_alt_ref_active = 0; + cpi->multi_arf_last_grp_enabled = 0; - cpi->twopass.frames_to_key = 1; + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; - // Take a copy of the initial frame details - first_frame = *this_frame; + rc->frames_to_key = 1; - cpi->twopass.kf_group_bits = 0; // Total bits available to kf group - cpi->twopass.kf_group_error_left = 0; // Group modified error score. + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0; // Group modified error score. - kf_mod_err = calculate_modified_err(cpi, this_frame); + kf_mod_err = calculate_modified_err(twopass, oxcf, this_frame); - // find the next keyframe + // Find the next keyframe. 
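+ // Scan forward through the first pass stats, accumulating group error, + // until either a viable key frame candidate or the maximum interval is found.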
i = 0; - while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) { - // Accumulate kf group error - kf_group_err += calculate_modified_err(cpi, this_frame); + while (twopass->stats_in < twopass->stats_in_end && + rc->frames_to_key < cpi->oxcf.key_freq) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; - - // load a the next frame's stats + // Load the next frame's stats. last_frame = *this_frame; - input_stats(cpi, this_frame); + input_stats(twopass, this_frame); // Provided that we are not at the end of the file... - if (cpi->oxcf.auto_key - && lookup_next_frame_stats(cpi, &next_frame) != EOF) { - // Normal scene cut check - if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) + if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) { + double loop_decay_rate; + + // Check for a scene cut. + if (test_candidate_kf(twopass, &last_frame, this_frame, + twopass->stats_in)) break; - - // How fast is prediction quality decaying - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + // How fast is the prediction quality decaying? + loop_decay_rate = get_prediction_decay_rate(&cpi->common, + twopass->stats_in); // We want to know something about the recent past... rather than - // as used elsewhere where we are concened with decay in prediction + // as used elsewhere where we are concerned with decay in prediction // quality since the last GF or KF. recent_loop_decay[i % 8] = loop_decay_rate; decay_accumulator = 1.0; - for (j = 0; j < 8; j++) + for (j = 0; j < 8; ++j) decay_accumulator *= recent_loop_decay[j]; // Special check for transition or high motion followed by a - // to a static scene. - if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i, + // static scene. + if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i, loop_decay_rate, decay_accumulator)) break; - // Step on to the next frame - cpi->twopass.frames_to_key++; + // Step on to the next frame. + ++rc->frames_to_key; // If we don't have a real key frame within the next two - // forcekeyframeevery intervals then break out of the loop. - if (cpi->twopass.frames_to_key >= 2 * (int)cpi->key_frame_frequency) + // key_freq intervals then break out of the loop. + if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break; } else { - cpi->twopass.frames_to_key++; + ++rc->frames_to_key; } - i++; + ++i; } // If there is a max kf interval set by the user we must obey it. // We already breakout of the loop above at 2x max. - // This code centers the extra kf if the actual natural - // interval is between 1x and 2x - if (cpi->oxcf.auto_key - && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency) { - FIRSTPASS_STATS *current_pos = cpi->twopass.stats_in; - FIRSTPASS_STATS tmp_frame; + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. 
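A worked example of the centering described above, assuming only the halving rule visible in the code that follows (the subsequent rescan of the group error is omitted from the sketch):

#include <stdio.h>

static int center_forced_kf(int frames_to_key, int key_freq) {
  /* The scan above already stopped at 2 * key_freq at the latest, so a
   * single halving brings the interval back under key_freq. */
  if (frames_to_key > key_freq)
    frames_to_key /= 2;
  return frames_to_key;
}

int main(void) {
  /* A natural scene cut 80 frames out with key_freq = 50 forces a key
   * frame at 40, giving two roughly even halves instead of a 50 + 30
   * split. */
  printf("forced kf after %d frames\n", center_forced_kf(80, 50));
  return 0;
}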
+ if (cpi->oxcf.auto_key && + rc->frames_to_key > cpi->oxcf.key_freq) { + FIRSTPASS_STATS tmp_frame = first_frame; - cpi->twopass.frames_to_key /= 2; + rc->frames_to_key /= 2; - // Copy first frame details - tmp_frame = first_frame; - - // Reset to the start of the group - reset_fpf_position(cpi, start_position); + // Reset to the start of the group. + reset_fpf_position(twopass, start_position); kf_group_err = 0; - kf_group_intra_err = 0; - kf_group_coded_err = 0; - // Rescan to get the correct error data for the forced kf group - for (i = 0; i < cpi->twopass.frames_to_key; i++) { - // Accumulate kf group errors - kf_group_err += calculate_modified_err(cpi, &tmp_frame); - kf_group_intra_err += tmp_frame.intra_error; - kf_group_coded_err += tmp_frame.coded_error; - - // Load a the next frame's stats - input_stats(cpi, &tmp_frame); + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + kf_group_err += calculate_modified_err(twopass, oxcf, &tmp_frame); + input_stats(twopass, &tmp_frame); } - - // Reset to the start of the group - reset_fpf_position(cpi, current_pos); - - cpi->next_key_frame_forced = 1; + rc->next_key_frame_forced = 1; + } else if (twopass->stats_in == twopass->stats_in_end || + rc->frames_to_key >= cpi->oxcf.key_freq) { + rc->next_key_frame_forced = 1; } else { - cpi->next_key_frame_forced = 0; + rc->next_key_frame_forced = 0; } - // Special case for the last frame of the file - if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) { - // Accumulate kf group error - kf_group_err += calculate_modified_err(cpi, this_frame); - // These figures keep intra and coded error counts for all frames including - // key frames in the group. The effect of the key frame itself can be - // subtracted out using the first_frame data collected above. - kf_group_intra_err += this_frame->intra_error; - kf_group_coded_err += this_frame->coded_error; + if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { + int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; + int new_frame_to_key = (rc->frames_to_key + count) & (~count); + int j; + for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { + if (EOF == input_stats(twopass, this_frame)) + break; + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); + } + rc->frames_to_key = new_frame_to_key; + } + + // Special case for the last key frame of the file. + if (twopass->stats_in >= twopass->stats_in_end) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err(twopass, oxcf, this_frame); } // Calculate the number of bits that should be assigned to the kf group. - if ((cpi->twopass.bits_left > 0) && - (cpi->twopass.modified_error_left > 0.0)) { - // Max for a single normal frame (not key frame) - int max_bits = frame_max_bits(cpi); + if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, &cpi->oxcf); - // Maximum bits for the kf group + // Maximum number of bits allocated to the key frame group. int64_t max_grp_bits; // Default allocation based on bits left and relative - // complexity of the section - cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left * - (kf_group_err / - cpi->twopass.modified_error_left)); + // complexity of the section. 
+ twopass->kf_group_bits = (int64_t)(twopass->bits_left * + (kf_group_err / twopass->modified_error_left)); // Clip based on maximum per frame rate defined by the user. - max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key; - if (cpi->twopass.kf_group_bits > max_grp_bits) - cpi->twopass.kf_group_bits = max_grp_bits; + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; } else { - cpi->twopass.kf_group_bits = 0; + twopass->kf_group_bits = 0; } - // Reset the first pass file position - reset_fpf_position(cpi, start_position); + twopass->kf_group_bits = MAX(0, twopass->kf_group_bits); - // Determine how big to make this keyframe based on how well the subsequent - // frames use inter blocks. + // Reset the first pass file position. + reset_fpf_position(twopass, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. decay_accumulator = 1.0; boost_score = 0.0; - loop_decay_rate = 1.00; // Starting decay rate - - // Scan through the kf group collating various stats. - for (i = 0; i < cpi->twopass.frames_to_key; i++) { - double r; - - if (EOF == input_stats(cpi, &next_frame)) + for (i = 0; i < rc->frames_to_key; ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; // Monitor for static sections. - if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = - (next_frame.pcnt_inter - next_frame.pcnt_motion); - } + zero_motion_accumulator = MIN(zero_motion_accumulator, + get_zero_motion_factor(&next_frame)); // For the first few frames collect data to decide kf boost. - if (i <= (cpi->max_gf_interval * 2)) { - if (next_frame.intra_error > cpi->twopass.kf_intra_err_min) + if (i <= (rc->max_gf_interval * 2)) { + double r; + if (next_frame.intra_error > twopass->kf_intra_err_min) r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); else - r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min / + r = (IIKFACTOR2 * twopass->kf_intra_err_min / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); if (r > RMAX) r = RMAX; - // How fast is prediction quality decaying - if (!detect_flash(cpi, 0)) { - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator = decay_accumulator * loop_decay_rate; - decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR - ? MIN_DECAY_FACTOR : decay_accumulator; + // How fast is prediction quality decaying? + if (!detect_flash(twopass, 0)) { + const double loop_decay_rate = get_prediction_decay_rate(&cpi->common, + &next_frame); + decay_accumulator *= loop_decay_rate; + decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR); } boost_score += (decay_accumulator * r); } } - { - FIRSTPASS_STATS sectionstats; + reset_fpf_position(twopass, start_position); - zero_stats(&sectionstats); - reset_fpf_position(cpi, start_position); + // Store the zero motion percentage. + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - for (i = 0; i < cpi->twopass.frames_to_key; i++) { - input_stats(cpi, &next_frame); - accumulate_stats(&sectionstats, &next_frame); - } + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = + calculate_section_intra_ratio(start_position, twopass->stats_in_end, + rc->frames_to_key); - avg_stats(&sectionstats); + // Work out how many bits to allocate for the key frame itself.
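The body of calculate_boost_bits() is not part of this hunk; the sketch below assumes it keeps the proportional scheme that the deleted code a few lines further down performs by hand (each normal frame weighted as 100 allocation chunks against the key frame's boost, ignoring the static-scene and overflow special cases):

#include <stdint.h>
#include <stdio.h>

static int boost_bits_sketch(int frame_count, int boost,
                             int64_t total_group_bits) {
  /* One 100-chunk share per normal frame plus `boost` chunks for the
   * key frame; the key frame takes its share of the group budget. */
  const int64_t allocation_chunks = (int64_t)frame_count * 100 + boost;
  if (allocation_chunks == 0)
    return 0;
  return (int)(boost * total_group_bits / allocation_chunks);
}

int main(void) {
  /* 49 remaining frames, a boost of 300 and a 1,000,000-bit group:
   * the key frame receives 300/5200 of the budget, about 57,692 bits. */
  printf("kf bits = %d\n", boost_bits_sketch(49, 300, 1000000));
  return 0;
}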
+ rc->kf_boost = (int)boost_score; - cpi->twopass.section_intra_rating = (int) - (sectionstats.intra_error - / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); - } + if (rc->kf_boost < (rc->frames_to_key * 3)) + rc->kf_boost = (rc->frames_to_key * 3); + if (rc->kf_boost < MIN_KF_BOOST) + rc->kf_boost = MIN_KF_BOOST; - // Reset the first pass file position - reset_fpf_position(cpi, start_position); + kf_bits = calculate_boost_bits((rc->frames_to_key - 1), + rc->kf_boost, twopass->kf_group_bits); - // Work out how many bits to allocate for the key frame itself - if (1) { - int kf_boost = (int)boost_score; - int allocation_chunks; - int alt_kf_bits; + twopass->kf_group_bits -= kf_bits; - if (kf_boost < (cpi->twopass.frames_to_key * 3)) - kf_boost = (cpi->twopass.frames_to_key * 3); + // Save the bits to spend on the key frame. + gf_group->bit_allocation[0] = kf_bits; + gf_group->update_type[0] = KF_UPDATE; + gf_group->rf_level[0] = KF_STD; - if (kf_boost < 300) // Min KF boost - kf_boost = 300; - - // Make a note of baseline boost and the zero motion - // accumulator value for use elsewhere. - cpi->kf_boost = kf_boost; - cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - - // We do three calculations for kf size. - // The first is based on the error score for the whole kf group. - // The second (optionaly) on the key frames own error if this is - // smaller than the average for the group. - // The final one insures that the frame receives at least the - // allocation it would have received based on its own error score vs - // the error score remaining - // Special case if the sequence appears almost totaly static - // In this case we want to spend almost all of the bits on the - // key frame. - // cpi->twopass.frames_to_key-1 because key frame itself is taken - // care of by kf_boost. - if (zero_motion_accumulator >= 0.99) { - allocation_chunks = - ((cpi->twopass.frames_to_key - 1) * 10) + kf_boost; - } else { - allocation_chunks = - ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost; - } - - // Prevent overflow - if (kf_boost > 1028) { - int divisor = kf_boost >> 10; - kf_boost /= divisor; - allocation_chunks /= divisor; - } - - cpi->twopass.kf_group_bits = - (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits; - - // Calculate the number of bits to be spent on the key frame - cpi->twopass.kf_bits = - (int)((double)kf_boost * - ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks)); - - // If the key frame is actually easier than the average for the - // kf group (which does sometimes happen... eg a blank intro frame) - // Then use an alternate calculation based on the kf error score - // which should give a smaller key frame. 
- if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key) { - double alt_kf_grp_bits = - ((double)cpi->twopass.bits_left * - (kf_mod_err * (double)cpi->twopass.frames_to_key) / - DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)); - - alt_kf_bits = (int)((double)kf_boost * - (alt_kf_grp_bits / (double)allocation_chunks)); - - if (cpi->twopass.kf_bits > alt_kf_bits) { - cpi->twopass.kf_bits = alt_kf_bits; - } - } else { - // Else if it is much harder than other frames in the group make sure - // it at least receives an allocation in keeping with its relative - // error score - alt_kf_bits = - (int)((double)cpi->twopass.bits_left * - (kf_mod_err / - DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left))); - - if (alt_kf_bits > cpi->twopass.kf_bits) { - cpi->twopass.kf_bits = alt_kf_bits; - } - } - - cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits; - // Add in the minimum frame allowance - cpi->twopass.kf_bits += cpi->min_frame_bandwidth; - - // Peer frame bit target for this frame - cpi->per_frame_bandwidth = cpi->twopass.kf_bits; - // Convert to a per second bitrate - cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * - cpi->output_framerate); - } - - // Note the total error score of the kf group minus the key frame itself - cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err); + // Note the total error score of the kf group minus the key frame itself. + twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. - cpi->twopass.modified_error_left -= kf_group_err; + twopass->modified_error_left -= kf_group_err; +} + +// For VBR: adjustment to the frame target based on error from previous frames. +void vbr_rate_correction(int *this_frame_target, + const int64_t vbr_bits_off_target) { + int max_delta = (*this_frame_target * 15) / 100; + + // vbr_bits_off_target > 0 means we have extra bits to spend. + if (vbr_bits_off_target > 0) { + *this_frame_target += + (vbr_bits_off_target > max_delta) ? max_delta : (int)vbr_bits_off_target; + } else { + *this_frame_target -= + (vbr_bits_off_target < -max_delta) ? max_delta : (int)-vbr_bits_off_target; + } +}
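A usage sketch for vbr_rate_correction() above: the per-frame correction is capped at 15% of the frame target, so accumulated over- or undershoot is paid back gradually over several frames rather than destabilizing a single one.

#include <stdint.h>
#include <stdio.h>

/* Links against vbr_rate_correction() as defined above. */
void vbr_rate_correction(int *this_frame_target,
                         const int64_t vbr_bits_off_target);

int main(void) {
  int target = 100000;
  /* 40,000 bits under budget so far, but max_delta is 15,000 bits: */
  vbr_rate_correction(&target, 40000);
  printf("adjusted target = %d\n", target);  /* prints 115000 */
  return 0;
}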
+// Define the reference buffers that will be updated post encode. +void configure_buffer_updates(VP9_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + + cpi->rc.is_src_frame_alt_ref = 0; + switch (twopass->gf_group.update_type[twopass->gf_group.index]) { + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + case GF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case ARF_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + default: + assert(0); + break; + } + if (is_two_pass_svc(cpi)) { + if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) + cpi->refresh_golden_frame = 0; + if (cpi->alt_ref_source == NULL) + cpi->refresh_alt_ref_frame = 0; + } +} + + +void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frames_left; + FIRSTPASS_STATS this_frame; + FIRSTPASS_STATS this_frame_copy; + + int target_rate; + LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? + &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : 0; + + if (lc != NULL) { + frames_left = (int)(twopass->total_stats.count - + lc->current_video_frame_in_layer); + } else { + frames_left = (int)(twopass->total_stats.count - + cm->current_video_frame); + } + + if (!twopass->stats_in) + return; + + // If this is an arf frame then we don't want to read the stats file or + // advance the input pointer as we already have what we need. + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + int target_rate; + configure_buffer_updates(cpi); + target_rate = gf_group->bit_allocation[gf_group->index]; + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + rc->base_frame_target = target_rate; + + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR) + vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); + + vp9_rc_set_frame_target(cpi, target_rate); + cm->frame_type = INTER_FRAME; + + if (lc != NULL) { + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = 0; + } else { + lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + + if (lc->is_key_frame) + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + } + } + + return; + } + + vp9_clear_system_state(); + + if (lc != NULL && twopass->kf_intra_err_min == 0) { + twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; + twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; + } + + if (cpi->oxcf.rc_mode == VPX_Q) { + twopass->active_worst_quality = cpi->oxcf.cq_level; + } else if (cm->current_video_frame == 0 || + (lc != NULL && lc->current_video_frame_in_layer == 0)) { + // Special case code for first frame.
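Before the first frame is coded there are no real frame sizes to steer from, so the code below seeds the active worst quality from the average per-frame budget of the remaining section. Illustrative arithmetic (all numbers hypothetical):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t bits_left = 48000000;   /* e.g. 12 s of a 4 Mbit/s clip */
  const int frames_left = 360;          /* 12 s at 30 fps */
  const int section_target_bandwidth = (int)(bits_left / frames_left);
  /* This per-frame budget (133,333 bits here) is what
   * get_twopass_worst_quality() receives to pick an initial
   * worst-case Q for the section. */
  printf("section target = %d bits/frame\n", section_target_bandwidth);
  return 0;
}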
+ const int section_target_bandwidth = (int)(twopass->bits_left / + frames_left); + const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats, + section_target_bandwidth); + twopass->active_worst_quality = tmp_q; + rc->ni_av_qi = tmp_q; + rc->avg_q = vp9_convert_qindex_to_q(tmp_q); + } + vp9_zero(this_frame); + if (EOF == input_stats(twopass, &this_frame)) + return; + + // Local copy of the current frame's first pass stats. + this_frame_copy = this_frame; + + // Keyframe and section processing. + if (rc->frames_to_key == 0 || + (cpi->frame_flags & FRAMEFLAGS_KEY)) { + // Define next KF group and assign bits to it. + find_next_key_frame(cpi, &this_frame_copy); + } else { + cm->frame_type = INTER_FRAME; + } + + if (lc != NULL) { + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = (cm->frame_type == KEY_FRAME); + if (lc->is_key_frame) { + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + lc->frames_from_key_frame = 0; + } + } else { + cm->frame_type = INTER_FRAME; + lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + + if (lc->is_key_frame) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + lc->frames_from_key_frame = 0; + } + } + } + + // Define a new GF/ARF group. (Should always enter here for key frames). + if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, &this_frame_copy); + + if (twopass->gf_zeromotion_pct > 995) { + // As long as max_thresh for encode breakout is small enough, it is ok + // to enable it for show frame, i.e. set allow_encode_breakout to + // ENCODE_BREAKOUT_LIMITED. + if (!cm->show_frame) + cpi->allow_encode_breakout = ENCODE_BREAKOUT_DISABLED; + else + cpi->allow_encode_breakout = ENCODE_BREAKOUT_LIMITED; + } + + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + if (lc != NULL) + cpi->refresh_golden_frame = 1; + } + + configure_buffer_updates(cpi); + + target_rate = gf_group->bit_allocation[gf_group->index]; + if (cpi->common.frame_type == KEY_FRAME) + target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate); + else + target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); + + rc->base_frame_target = target_rate; + + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR) + vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); + + vp9_rc_set_frame_target(cpi, target_rate); + + // Update the total stats remaining structure. + subtract_stats(&twopass->total_left_stats, &this_frame); +} + +void vp9_twopass_postencode_update(VP9_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + const int bits_used = rc->base_frame_target; + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + + twopass->bits_left = MAX(twopass->bits_left - bits_used, 0); + + if (cpi->common.frame_type != KEY_FRAME && + !vp9_is_upper_layer_key_frame(cpi)) { + twopass->kf_group_bits -= bits_used; + } + twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0); + + // Increment the gf group index ready for the next frame. 
+ ++twopass->gf_group.index; } diff --git a/media/libvpx/vp9/encoder/vp9_firstpass.h b/media/libvpx/vp9/encoder/vp9_firstpass.h index c18d11e04313..aaa6b0311c62 100644 --- a/media/libvpx/vp9/encoder/vp9_firstpass.h +++ b/media/libvpx/vp9/encoder/vp9_firstpass.h @@ -10,14 +10,127 @@ #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ #define VP9_ENCODER_VP9_FIRSTPASS_H_ -#include "vp9/encoder/vp9_onyx_int.h" -void vp9_init_first_pass(VP9_COMP *cpi); -void vp9_first_pass(VP9_COMP *cpi); -void vp9_end_first_pass(VP9_COMP *cpi); +#include "vp9/encoder/vp9_lookahead.h" +#include "vp9/encoder/vp9_ratectrl.h" -void vp9_init_second_pass(VP9_COMP *cpi); -void vp9_second_pass(VP9_COMP *cpi); -void vp9_end_second_pass(VP9_COMP *cpi); +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_FP_MB_STATS + +#define FPMB_DCINTRA_MASK 0x01 + +#define FPMB_MOTION_ZERO_MASK 0x02 +#define FPMB_MOTION_LEFT_MASK 0x04 +#define FPMB_MOTION_RIGHT_MASK 0x08 +#define FPMB_MOTION_UP_MASK 0x10 +#define FPMB_MOTION_DOWN_MASK 0x20 + +#define FPMB_ERROR_SMALL_MASK 0x40 +#define FPMB_ERROR_LARGE_MASK 0x80 +#define FPMB_ERROR_SMALL_TH 2000 +#define FPMB_ERROR_LARGE_TH 48000 + +typedef struct { + uint8_t *mb_stats_start; + uint8_t *mb_stats_end; +} FIRSTPASS_MB_STATS; +#endif + +typedef struct { + double frame; + double intra_error; + double coded_error; + double sr_coded_error; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double new_mv_count; + double duration; + double count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +typedef enum { + KF_UPDATE = 0, + LF_UPDATE = 1, + GF_UPDATE = 2, + ARF_UPDATE = 3, + OVERLAY_UPDATE = 4, + FRAME_UPDATE_TYPES = 5 +} FRAME_UPDATE_TYPE; + +typedef struct { + unsigned char index; + RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; + FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; + unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; + int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; +} GF_GROUP; + +typedef struct { + unsigned int section_intra_rating; + FIRSTPASS_STATS total_stats; + FIRSTPASS_STATS this_frame_stats; + const FIRSTPASS_STATS *stats_in; + const FIRSTPASS_STATS *stats_in_start; + const FIRSTPASS_STATS *stats_in_end; + FIRSTPASS_STATS total_left_stats; + int first_pass_done; + int64_t bits_left; + double modified_error_min; + double modified_error_max; + double modified_error_left; + double kf_intra_err_min; + double gf_intra_err_min; + +#if CONFIG_FP_MB_STATS + uint8_t *frame_mb_stats_buf; + uint8_t *this_frame_mb_stats; + FIRSTPASS_MB_STATS firstpass_mb_stats; +#endif + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + int64_t kf_group_error_left; + int sr_update_lag; + + int kf_zeromotion_pct; + int gf_zeromotion_pct; + + int active_worst_quality; + + GF_GROUP gf_group; +} TWO_PASS; + +struct VP9_COMP; + +void vp9_init_first_pass(struct VP9_COMP *cpi); +void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); +void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); +void vp9_end_first_pass(struct VP9_COMP *cpi); + +void vp9_init_second_pass(struct VP9_COMP *cpi); +void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); + +// Post encode update of the rate 
control parameters for 2-pass +void vp9_twopass_postencode_update(struct VP9_COMP *cpi); +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/media/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/vp9/encoder/vp9_lookahead.c index c28c868457a0..823e7a162420 100644 --- a/media/libvpx/vp9/encoder/vp9_lookahead.c +++ b/media/libvpx/vp9/encoder/vp9_lookahead.c @@ -11,22 +11,16 @@ #include #include "./vpx_config.h" + #include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_lookahead.h" -#include "vp9/common/vp9_extend.h" - -struct lookahead_ctx { - unsigned int max_sz; /* Absolute size of the queue */ - unsigned int sz; /* Number of buffers currently in the queue */ - unsigned int read_idx; /* Read index */ - unsigned int write_idx; /* Write index */ - struct lookahead_entry *buf; /* Buffer list */ -}; - /* Return the buffer at the given absolute index and increment the index */ -static struct lookahead_entry * pop(struct lookahead_ctx *ctx, - unsigned int *idx) { +static struct lookahead_entry *pop(struct lookahead_ctx *ctx, + unsigned int *idx) { unsigned int index = *idx; struct lookahead_entry *buf = ctx->buf + index; @@ -52,16 +46,22 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx) { } -struct lookahead_ctx * vp9_lookahead_init(unsigned int width, - unsigned int height, - unsigned int subsampling_x, - unsigned int subsampling_y, - unsigned int depth) { +struct lookahead_ctx *vp9_lookahead_init(unsigned int width, + unsigned int height, + unsigned int subsampling_x, + unsigned int subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + unsigned int depth) { struct lookahead_ctx *ctx = NULL; // Clamp the lookahead queue depth depth = clamp(depth, 1, MAX_LAG_BUFFERS); + // Allocate memory to keep previous source frames available. 
+ depth += MAX_PRE_FRAMES; + // Allocate the lookahead structures ctx = calloc(1, sizeof(*ctx)); if (ctx) { @@ -73,7 +73,10 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width, for (i = 0; i < depth; i++) if (vp9_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, subsampling_y, - VP9BORDERINPIXELS)) +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS)) goto bail; } return ctx; @@ -85,8 +88,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width, #define USE_PARTIAL_COPY 0 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, unsigned int flags, - unsigned char *active_map) { + int64_t ts_start, int64_t ts_end, unsigned int flags) { struct lookahead_entry *buf; #if USE_PARTIAL_COPY int row, col, active_end; @@ -94,7 +96,7 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int mb_cols = (src->y_width + 15) >> 4; #endif - if (ctx->sz + 1 > ctx->max_sz) + if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1; ctx->sz++; buf = pop(ctx, &ctx->write_idx); @@ -157,11 +159,11 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, } -struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx, - int drain) { +struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx, + int drain) { struct lookahead_entry *buf = NULL; - if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) { + if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { buf = pop(ctx, &ctx->read_idx); ctx->sz--; } @@ -169,17 +171,28 @@ struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx, } -struct lookahead_entry * vp9_lookahead_peek(struct lookahead_ctx *ctx, - int index) { +struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx, + int index) { struct lookahead_entry *buf = NULL; - assert(index < (int)ctx->max_sz); - if (index < (int)ctx->sz) { - index += ctx->read_idx; - if (index >= (int)ctx->max_sz) - index -= ctx->max_sz; - buf = ctx->buf + index; + if (index >= 0) { + // Forward peek + if (index < (int)ctx->sz) { + index += ctx->read_idx; + if (index >= (int)ctx->max_sz) + index -= ctx->max_sz; + buf = ctx->buf + index; + } + } else if (index < 0) { + // Backward peek + if (-index <= MAX_PRE_FRAMES) { + index += ctx->read_idx; + if (index < 0) + index += ctx->max_sz; + buf = ctx->buf + index; + } } + return buf; } diff --git a/media/libvpx/vp9/encoder/vp9_lookahead.h b/media/libvpx/vp9/encoder/vp9_lookahead.h index c773f8fcc6d6..27861933d04f 100644 --- a/media/libvpx/vp9/encoder/vp9_lookahead.h +++ b/media/libvpx/vp9/encoder/vp9_lookahead.h @@ -14,6 +14,15 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#if CONFIG_SPATIAL_SVC +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_LAG_BUFFERS 25 struct lookahead_entry { @@ -21,10 +30,22 @@ struct lookahead_entry { int64_t ts_start; int64_t ts_end; unsigned int flags; + +#if CONFIG_SPATIAL_SVC + vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS]; +#endif }; +// The max of past frames we want to keep in the queue. 
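A sketch of the negative-index wrap used by vp9_lookahead_peek() above: with MAX_PRE_FRAMES buffers retained behind read_idx, peeking at index -1 reaches the source most recently handed to the encoder (the helper name here is illustrative):

#include <stdio.h>

static int wrap_peek_index(int index, unsigned int read_idx,
                           unsigned int max_sz) {
  /* Same arithmetic as the backward-peek branch above. */
  index += (int)read_idx;
  if (index < 0)
    index += (int)max_sz;
  return index;
}

int main(void) {
  /* With read_idx at slot 0 of a 6-slot ring, peeking one frame back
   * wraps around to slot 5. */
  printf("buf slot = %d\n", wrap_peek_index(-1, 0, 6));
  return 0;
}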
+#define MAX_PRE_FRAMES 1 -struct lookahead_ctx; +struct lookahead_ctx { + unsigned int max_sz; /* Absolute size of the queue */ + unsigned int sz; /* Number of buffers currently in the queue */ + unsigned int read_idx; /* Read index */ + unsigned int write_idx; /* Write index */ + struct lookahead_entry *buf; /* Buffer list */ +}; /**\brief Initializes the lookahead stage * @@ -35,6 +56,9 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif unsigned int depth); @@ -59,8 +83,7 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx); * \param[in] active_map Map that specifies which macroblock is active */ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, - int64_t ts_start, int64_t ts_end, unsigned int flags, - unsigned char *active_map); + int64_t ts_start, int64_t ts_end, unsigned int flags); /**\brief Get the next source buffer to encode @@ -94,4 +117,8 @@ struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx, */ unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/media/libvpx/vp9/encoder/vp9_mbgraph.c b/media/libvpx/vp9/encoder/vp9_mbgraph.c index 7b605b212f82..b8e7164929bd 100644 --- a/media/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/media/libvpx/vp9/encoder/vp9_mbgraph.c @@ -11,8 +11,6 @@ #include #include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_encodeintra.h" -#include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_mcomp.h" #include "vp9/common/vp9_blockd.h" @@ -21,57 +19,54 @@ #include "vp9/common/vp9_systemdependent.h" - static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, - int_mv *ref_mv, - int_mv *dst_mv, + const MV *ref_mv, + MV *dst_mv, int mb_row, int mb_col) { - MACROBLOCK *const x = &cpi->mb; + MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; - unsigned int best_err; + const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; const int tmp_col_min = x->mv_col_min; const int tmp_col_max = x->mv_col_max; const int tmp_row_min = x->mv_row_min; const int tmp_row_max = x->mv_row_max; - int_mv ref_full; + MV ref_full; + int sad_list[5]; // Further step/diamond searches as necessary - int step_param = cpi->sf.reduce_first_step_size + - (cpi->speed < 8 ? (cpi->speed > 5 ? 
1 : 0) : 2); - step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); + int step_param = mv_sf->reduce_first_step_size; + step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2); - vp9_clamp_mv_min_max(x, &ref_mv->as_mv); + vp9_set_mv_search_range(x, ref_mv); - ref_full.as_mv.col = ref_mv->as_mv.col >> 3; - ref_full.as_mv.row = ref_mv->as_mv.row >> 3; + ref_full.col = ref_mv->col >> 3; + ref_full.row = ref_mv->row >> 3; /*cpi->sf.search_method == HEX*/ - best_err = vp9_hex_search(x, &ref_full.as_mv, step_param, x->errorperbit, - 0, &v_fn_ptr, - 0, &ref_mv->as_mv, &dst_mv->as_mv); + vp9_hex_search(x, &ref_full, step_param, x->errorperbit, 0, + cond_sad_list(cpi, sad_list), + &v_fn_ptr, 0, ref_mv, dst_mv); // Try sub-pixel MC // if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; unsigned int sse; - best_err = cpi->find_fractional_mv_step( - x, - &dst_mv->as_mv, &ref_mv->as_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, &v_fn_ptr, - 0, cpi->sf.subpel_iters_per_step, NULL, NULL, - & distortion, &sse); + cpi->find_fractional_mv_step( + x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, + &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + cond_sad_list(cpi, sad_list), + NULL, NULL, + &distortion, &sse, NULL, 0, 0); } - vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv); + xd->mi[0]->mbmi.mode = NEWMV; + xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv; + vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); - best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - INT_MAX); /* restore UMV window */ x->mv_col_min = tmp_col_min; @@ -79,21 +74,21 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, x->mv_row_min = tmp_row_min; x->mv_row_max = tmp_row_max; - return best_err; + return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, xd->plane[0].dst.stride); } -static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv, - int mb_row, int mb_col) { +static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, + int_mv *dst_mv, int mb_row, int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; unsigned int err, tmp_err; - int_mv tmp_mv; + MV tmp_mv; // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - INT_MAX); + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; // Test last reference frame using the previous best mv as the @@ -101,20 +96,19 @@ static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv, tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col); if (tmp_err < err) { err = tmp_err; - dst_mv->as_int = tmp_mv.as_int; + dst_mv->as_mv = tmp_mv; } // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
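The comment above describes a cheap insurance policy: a neighbour-predicted start can drag the search away from a static match, so the code that follows re-runs the search from {0,0} whenever the predictor is non-zero and keeps whichever error is lower. A minimal sketch of that pattern (the types and the search stand-in are hypothetical):

#include <stdio.h>
#include <stdlib.h>

typedef struct { int row, col; } MV_SKETCH;

/* Stand-in for do_16x16_motion_iteration(): favors small motion so the
 * example is deterministic. */
static unsigned int search_from(MV_SKETCH start) {
  return (unsigned int)(abs(start.row) + abs(start.col) + 10);
}

static unsigned int best_of_two_starts(MV_SKETCH pred, MV_SKETCH *best) {
  const MV_SKETCH zero = {0, 0};
  unsigned int err = search_from(pred);
  *best = pred;
  /* Only worth a second pass when the predicted start is non-zero. */
  if (pred.row != 0 || pred.col != 0) {
    const unsigned int zero_err = search_from(zero);
    if (zero_err < err) {
      err = zero_err;
      *best = zero;
    }
  }
  return err;
}

int main(void) {
  MV_SKETCH pred = {4, -6}, best;
  const unsigned int err = best_of_two_starts(pred, &best);
  printf("err = %u, best = {%d, %d}\n", err, best.row, best.col);
  return 0;
}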
- if (ref_mv->as_int) { + if (ref_mv->row != 0 || ref_mv->col != 0) { unsigned int tmp_err; - int_mv zero_ref_mv, tmp_mv; + MV zero_ref_mv = {0, 0}, tmp_mv; - zero_ref_mv.as_int = 0; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); if (tmp_err < err) { - dst_mv->as_int = tmp_mv.as_int; + dst_mv->as_mv = tmp_mv; err = tmp_err; } } @@ -130,19 +124,16 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) { // Try zero MV first // FIXME should really use something like near/nearest MV and/or MV prediction err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, - INT_MAX); + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); dst_mv->as_int = 0; return err; } -static int find_best_16x16_intra(VP9_COMP *cpi, - int mb_y_offset, - MB_PREDICTION_MODE *pbest_mode) { +static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - MB_PREDICTION_MODE best_mode = -1, mode; + PREDICTION_MODE best_mode = -1, mode; unsigned int best_err = INT_MAX; // calculate SATD for each intra prediction mode; @@ -150,12 +141,13 @@ static int find_best_16x16_intra(VP9_COMP *cpi, for (mode = DC_PRED; mode <= TM_PRED; mode++) { unsigned int err; - xd->mi_8x8[0]->mbmi.mode = mode; + xd->mi[0]->mbmi.mode = mode; vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode, x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride); + xd->plane[0].dst.buf, xd->plane[0].dst.stride, + 0, 0, 0); err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, best_err); + xd->plane[0].dst.buf, xd->plane[0].dst.stride); // find best if (err < best_err) { @@ -177,11 +169,8 @@ static void update_mbgraph_mb_stats YV12_BUFFER_CONFIG *buf, int mb_y_offset, YV12_BUFFER_CONFIG *golden_ref, - int_mv *prev_golden_ref_mv, - int gld_y_offset, + const MV *prev_golden_ref_mv, YV12_BUFFER_CONFIG *alt_ref, - int_mv *prev_alt_ref_mv, - int arf_y_offset, int mb_row, int mb_col ) { @@ -198,7 +187,7 @@ static void update_mbgraph_mb_stats xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride; // do intra 16x16 prediction - intra_error = find_best_16x16_intra(cpi, mb_y_offset, + intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode); if (intra_error <= 0) intra_error = 1; @@ -246,34 +235,31 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int mb_col, mb_row, offset = 0; int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0; - int_mv arf_top_mv, gld_top_mv; - MODE_INFO mi_local = { { 0 } }; + MV gld_top_mv = {0, 0}; + MODE_INFO mi_local; + vp9_zero(mi_local); // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. 
- arf_top_mv.as_int = 0; - gld_top_mv.as_int = 0; x->mv_row_min = -BORDER_MV_PIXELS_B16; x->mv_row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16; xd->up_available = 0; xd->plane[0].dst.stride = buf->y_stride; xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; - xd->mi_8x8[0] = &mi_local; + xd->mi[0] = &mi_local; mi_local.mbmi.sb_type = BLOCK_16X16; mi_local.mbmi.ref_frame[0] = LAST_FRAME; mi_local.mbmi.ref_frame[1] = NONE; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - int_mv arf_left_mv, gld_left_mv; + MV gld_left_mv = gld_top_mv; int mb_y_in_offset = mb_y_offset; int arf_y_in_offset = arf_y_offset; int gld_y_in_offset = gld_y_offset; // Set up limit values for motion vectors to prevent them extending outside // the UMV borders. - arf_left_mv.as_int = arf_top_mv.as_int; - gld_left_mv.as_int = gld_top_mv.as_int; x->mv_col_min = -BORDER_MV_PIXELS_B16; x->mv_col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16; xd->left_available = 0; @@ -282,14 +268,11 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col]; update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, - golden_ref, &gld_left_mv, gld_y_in_offset, - alt_ref, &arf_left_mv, arf_y_in_offset, + golden_ref, &gld_left_mv, alt_ref, mb_row, mb_col); - arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int; - gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int; + gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv; if (mb_col == 0) { - arf_top_mv.as_int = arf_left_mv.as_int; - gld_top_mv.as_int = gld_left_mv.as_int; + gld_top_mv = gld_left_mv; } xd->left_available = 1; mb_y_in_offset += 16; @@ -324,8 +307,8 @@ static void separate_arf_mbs(VP9_COMP *cpi) { 1)); // We are not interested in results beyond the alt ref itself. 
- if (n_frames > cpi->frames_till_gf_update_due) - n_frames = cpi->frames_till_gf_update_due; + if (n_frames > cpi->rc.frames_till_gf_update_due) + n_frames = cpi->rc.frames_till_gf_update_due; // defer cost to reference frames for (i = n_frames - 1; i >= 0; i--) { @@ -356,7 +339,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { // If any of the blocks in the sequence failed then the MB // goes in segment 0 - if (arf_not_zz[mi_row/2*cm->mb_cols + mi_col/2]) { + if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) { ncnt[0]++; cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0; } else { @@ -378,11 +361,10 @@ static void separate_arf_mbs(VP9_COMP *cpi) { else cpi->static_mb_pct = 0; - cpi->seg0_cnt = ncnt[0]; - vp9_enable_segmentation((VP9_PTR)cpi); + vp9_enable_segmentation(&cm->seg); } else { cpi->static_mb_pct = 0; - vp9_disable_segmentation((VP9_PTR)cpi); + vp9_disable_segmentation(&cm->seg); } // Free locally allocated storage @@ -392,15 +374,13 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int i, n_frames = vp9_lookahead_depth(cpi->lookahead); - YV12_BUFFER_CONFIG *golden_ref = - &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]]; + YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME); // we need to look ahead beyond where the ARF transitions into // being a GF - so exit if we don't look ahead beyond that - if (n_frames <= cpi->frames_till_gf_update_due) + if (n_frames <= cpi->rc.frames_till_gf_update_due) return; - if (n_frames > (int)cpi->frames_till_alt_ref_frame) - n_frames = cpi->frames_till_alt_ref_frame; + if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS; @@ -426,7 +406,7 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) { golden_ref, cpi->Source); } - vp9_clear_system_state(); // __asm emms; + vp9_clear_system_state(); separate_arf_mbs(cpi); } diff --git a/media/libvpx/vp9/encoder/vp9_mbgraph.h b/media/libvpx/vp9/encoder/vp9_mbgraph.h index c5bca4d01f53..c3af972bc00b 100644 --- a/media/libvpx/vp9/encoder/vp9_mbgraph.h +++ b/media/libvpx/vp9/encoder/vp9_mbgraph.h @@ -11,6 +11,30 @@ #ifndef VP9_ENCODER_VP9_MBGRAPH_H_ #define VP9_ENCODER_VP9_MBGRAPH_H_ -void vp9_update_mbgraph_stats(VP9_COMP *cpi); +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + struct { + int err; + union { + int_mv mv; + PREDICTION_MODE mode; + } m; + } ref[MAX_REF_FRAMES]; +} MBGRAPH_MB_STATS; + +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; + +struct VP9_COMP; + +void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/media/libvpx/vp9/encoder/vp9_mcomp.c b/media/libvpx/vp9/encoder/vp9_mcomp.c index a52f5b1b0af5..d6f6b2563363 100644 --- a/media/libvpx/vp9/encoder/vp9_mcomp.c +++ b/media/libvpx/vp9/encoder/vp9_mcomp.c @@ -16,19 +16,28 @@ #include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_mcomp.h" // #define NEW_DIAMOND_SEARCH -void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) { - const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); - const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ?
1 : 0); - const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; - const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + +void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) { + int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); + int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); + int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + + col_min = MAX(col_min, (MV_LOW >> 3) + 1); + row_min = MAX(row_min, (MV_LOW >> 3) + 1); + col_max = MIN(col_max, (MV_UPP >> 3) - 1); + row_max = MIN(row_max, (MV_UPP >> 3) - 1); // Get intersection of UMV window and valid MV window to reduce # of checks // in diamond search. @@ -42,25 +51,20 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) { x->mv_row_max = row_max; } -int vp9_init_search_range(VP9_COMP *cpi, int size) { +int vp9_init_search_range(int size) { int sr = 0; - // Minimum search size no matter what the passed in value. size = MAX(16, size); while ((size << sr) < MAX_FULL_PEL_VAL) sr++; - if (sr) - sr--; - - sr += cpi->sf.reduce_first_step_size; - sr = MIN(sr, (cpi->sf.max_step_search_steps - 2)); + sr = MIN(sr, MAX_MVSEARCH_STEPS - 2); return sr; } static INLINE int mv_cost(const MV *mv, - const int *joint_cost, int *comp_cost[2]) { + const int *joint_cost, int *const comp_cost[2]) { return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + comp_cost[1][mv->col]; } @@ -84,63 +88,43 @@ static int mv_err_cost(const MV *mv, const MV *ref, return 0; } -static int mvsad_err_cost(const MV *mv, const MV *ref, - const int *mvjsadcost, int *mvsadcost[2], +static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, int error_per_bit) { - if (mvsadcost) { + if (x->nmvsadcost) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjsadcost, mvsadcost) * - error_per_bit, 8); + return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost, + x->nmvsadcost) * error_per_bit, 8); } return 0; } -void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) { - int len; - int search_site_count = 0; - - // Generate offsets for 4 search sites per step. - x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = 0; - search_site_count++; - - for (len = MAX_FIRST_STEP; len > 0; len /= 2) { - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = -len; - x->ss[search_site_count].offset = -len * stride; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = 0; - x->ss[search_site_count].mv.row = len; - x->ss[search_site_count].offset = len * stride; - search_site_count++; - - // Compute offsets for search sites. - x->ss[search_site_count].mv.col = -len; - x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = -len; - search_site_count++; - - // Compute offsets for search sites. 
- x->ss[search_site_count].mv.col = len; - x->ss[search_site_count].mv.row = 0; - x->ss[search_site_count].offset = len; - search_site_count++; - } - - x->ss_count = search_site_count; - x->searches_per_step = 4; -} - -void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { +void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len, ss_count = 1; - x->ss[0].mv.col = x->ss[0].mv.row = 0; - x->ss[0].offset = 0; + cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0; + cfg->ss[0].offset = 0; + + for (len = MAX_FIRST_STEP; len > 0; len /= 2) { + // Generate offsets for 4 search sites per step. + const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}}; + int i; + for (i = 0; i < 4; ++i) { + search_site *const ss = &cfg->ss[ss_count++]; + ss->mv = ss_mvs[i]; + ss->offset = ss->mv.row * stride + ss->mv.col; + } + } + + cfg->ss_count = ss_count; + cfg->searches_per_step = 4; +} + +void vp9_init3smotion_compensation(search_site_config *cfg, int stride) { + int len, ss_count = 1; + + cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0; + cfg->ss[0].offset = 0; for (len = MAX_FIRST_STEP; len > 0; len /= 2) { // Generate offsets for 8 search sites per step. @@ -150,14 +134,14 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { }; int i; for (i = 0; i < 8; ++i) { - search_site *const ss = &x->ss[ss_count++]; + search_site *const ss = &cfg->ss[ss_count++]; ss->mv = ss_mvs[i]; ss->offset = ss->mv.row * stride + ss->mv.col; } } - x->ss_count = ss_count; - x->searches_per_step = 8; + cfg->ss_count = ss_count; + cfg->searches_per_step = 8; } /* @@ -178,35 +162,34 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { error_per_bit + 4096) >> 13 : 0) -#define SP(x) (((x) & 7) << 1) // convert motion vector component to offset - // for svf calc +// convert motion vector component to offset for svf calc +static INLINE int sp(int x) { + return (x & 7) << 1; +} -#define IFMVCV(r, c, s, e) \ - if (c >= minc && c <= maxc && r >= minr && r <= maxr) \ - s \ - else \ - e; - -/* pointer to predictor base of a motionvector */ -#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset))) - -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, src_stride, &sse) +static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { + return &buf[(r >> 3) * stride + (c >> 3)]; +} /* checks if (r, c) has better score than previous best */ #define CHECK_BETTER(v, r, c) \ - IFMVCV(r, c, { \ - thismse = (DIST(r, c)); \ - if ((v = MVC(r, c) + thismse) < besterr) { \ - besterr = v; \ - br = r; \ - bc = c; \ - *distortion = thismse; \ - *sse1 = sse; \ - } \ - }, \ - v = INT_MAX;) + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + if (second_pred == NULL) \ + thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ + src_stride, &sse); \ + else \ + thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \ + z, src_stride, &sse, second_pred); \ + if ((v = MVC(r, c) + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } #define FIRST_LEVEL_CHECKS \ { \ @@ -273,93 +256,126 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { } \ } -int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, - MV *bestmv, const MV *ref_mv, - int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, - int iters_per_step, - int *mvjcost, int *mvcost[2], 
- int *distortion, - unsigned int *sse1) { - uint8_t *z = x->plane[0].src.buf; - int src_stride = x->plane[0].src.stride; - MACROBLOCKD *xd = &x->e_mbd; - - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int whichdir; - unsigned int halfiters = iters_per_step; - unsigned int quarteriters = iters_per_step; - unsigned int eighthiters = iters_per_step; - int thismse; - - const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - uint8_t *y = xd->plane[0].pre[0].buf + offset; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - - int tr = br; - int tc = bc; - - // central mv - bestmv->row <<= 3; - bestmv->col <<= 3; - - // calculate central point error - besterr = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = besterr; +#define SETUP_SUBPEL_SEARCH \ + const uint8_t *const z = x->plane[0].src.buf; \ + const int src_stride = x->plane[0].src.stride; \ + const MACROBLOCKD *xd = &x->e_mbd; \ + unsigned int besterr = INT_MAX; \ + unsigned int sse; \ + unsigned int whichdir; \ + int thismse; \ + const unsigned int halfiters = iters_per_step; \ + const unsigned int quarteriters = iters_per_step; \ + const unsigned int eighthiters = iters_per_step; \ + const int y_stride = xd->plane[0].pre[0].stride; \ + const int offset = bestmv->row * y_stride + bestmv->col; \ + const uint8_t *const y = xd->plane[0].pre[0].buf; \ + \ + int rr = ref_mv->row; \ + int rc = ref_mv->col; \ + int br = bestmv->row * 8; \ + int bc = bestmv->col * 8; \ + int hstep = 4; \ + const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); \ + const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); \ + const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); \ + const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); \ + int tr = br; \ + int tc = bc; \ + \ + bestmv->row *= 8; \ + bestmv->col *= 8; \ + if (second_pred != NULL) { \ + DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \ + vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \ + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \ + } else { \ + besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \ + } \ + *distortion = besterr; \ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - // TODO(jbb): Each subsequent iteration checks at least one point in - // common with the last iteration could be 2 if diagonal is selected. - while (halfiters--) { - // 1/2 pel +int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x, + MV *bestmv, const MV *ref_mv, + int allow_hp, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int forced_stop, + int iters_per_step, + int *sad_list, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { + SETUP_SUBPEL_SEARCH; + + if (sad_list && + sad_list[0] != INT_MAX && sad_list[1] != INT_MAX && + sad_list[2] != INT_MAX && sad_list[3] != INT_MAX && + sad_list[4] != INT_MAX) { + unsigned int left, right, up, down, diag; + whichdir = (sad_list[1] < sad_list[3] ? 0 : 1) + + (sad_list[2] < sad_list[4] ? 
0 : 2); + switch (whichdir) { + case 0: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(down, tr + hstep, tc); + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + } else { FIRST_LEVEL_CHECKS; - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; - tc = bc; + if (halfiters > 1) { + SECOND_LEVEL_CHECKS; + } } - // TODO(yaowu): Each subsequent iteration checks at least one point in common - // with the last iteration could be 2 if diagonal is selected. + tr = br; + tc = bc; + + // Each subsequent iteration checks at least one point in common with the + // last iteration (it could be two if the diagonal was selected). 1/4 pel. // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only if (forced_stop != 2) { hstep >>= 1; - while (quarteriters--) { - FIRST_LEVEL_CHECKS; - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; - tc = bc; + FIRST_LEVEL_CHECKS; + if (quarteriters > 1) { + SECOND_LEVEL_CHECKS; } + tr = br; + tc = bc; } if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { hstep >>= 1; - while (eighthiters--) { - FIRST_LEVEL_CHECKS; - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; - tc = bc; + FIRST_LEVEL_CHECKS; + if (eighthiters > 1) { + SECOND_LEVEL_CHECKS; } + tr = br; + tc = bc; }
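In the pruned path above, the four SADs collected around the best full-pel point decide a single quadrant up front, so only three half-pel candidates are evaluated before the step size is reduced. A sketch of the quadrant pick; the sad_list ordering is inferred from the switch above (center, left, up, right, down) and is an assumption, not confirmed by this hunk:

#include <stdio.h>

/* Indices assumed: 0 = center, 1 = left, 2 = up, 3 = right, 4 = down. */
static int pick_quadrant(const int sad_list[5]) {
  return (sad_list[1] < sad_list[3] ? 0 : 1) +
         (sad_list[2] < sad_list[4] ? 0 : 2);
}

int main(void) {
  const int sads[5] = {1000, 900, 950, 1200, 1100};
  /* left beats right and up beats down: quadrant 0, so only the left,
   * up, and up-left points are checked at half-pel. */
  printf("whichdir = %d\n", pick_quadrant(sads));
  return 0;
}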
+ // These lines ensure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; bestmv->row = br; bestmv->col = bc; @@ -371,253 +387,21 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x, return besterr; } -int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, +int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, + int *sad_list, int *mvjcost, int *mvcost[2], int *distortion, - unsigned int *sse1) { - uint8_t *z = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int whichdir; - int thismse; - unsigned int halfiters = iters_per_step; - unsigned int quarteriters = iters_per_step; - unsigned int eighthiters = iters_per_step; - - const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - uint8_t *y = xd->plane[0].pre[0].buf + offset; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - - int tr = br; - int tc = bc; - - // central mv - bestmv->row *= 8; - bestmv->col *= 8; - - // calculate central point error - besterr = vfp->vf(y, y_stride, z, src_stride, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // 1/2 pel - FIRST_LEVEL_CHECKS; - if (halfiters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - - // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only - if (forced_stop != 2) { - hstep >>= 1; - FIRST_LEVEL_CHECKS; - if (quarteriters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - } - - if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { - hstep >>= 1; - FIRST_LEVEL_CHECKS; - if (eighthiters > 1) { - SECOND_LEVEL_CHECKS; - } - tr = br; - tc = bc; - } - - bestmv->row = br; - bestmv->col = bc; - - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - - return besterr; -} - -#undef DIST -/* returns subpixel variance error function */ -#define DIST(r, c) \ - vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ - z, src_stride, &sse, second_pred) - -int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x, - MV *bestmv, const MV *ref_mv, - int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, - int iters_per_step, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1, - const uint8_t *second_pred, - int w, int h) { - uint8_t *const z = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - MACROBLOCKD *const xd = &x->e_mbd; - - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int whichdir; - unsigned int halfiters = iters_per_step; - unsigned int quarteriters = iters_per_step; - unsigned int eighthiters = iters_per_step; - int thismse; - - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); - const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - uint8_t *const y = xd->plane[0].pre[0].buf + offset; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int
hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - - int tr = br; - int tc = bc; - - // central mv - bestmv->row *= 8; - bestmv->col *= 8; - - // calculate central point error - // TODO(yunqingwang): central pointer error was already calculated in full- - // pixel search, and can be passed in this function. - comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); - besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); - - // Each subsequent iteration checks at least one point in - // common with the last iteration could be 2 ( if diag selected) - while (halfiters--) { - // 1/2 pel - FIRST_LEVEL_CHECKS; - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; - tc = bc; - } - - // Each subsequent iteration checks at least one point in common with - // the last iteration could be 2 ( if diag selected) 1/4 pel - - // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only - if (forced_stop != 2) { - hstep >>= 1; - while (quarteriters--) { - FIRST_LEVEL_CHECKS; - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; - tc = bc; - } - } - - if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) { - hstep >>= 1; - while (eighthiters--) { - FIRST_LEVEL_CHECKS; - // no reason to check the same one again. - if (tr == br && tc == bc) - break; - tr = br; - tc = bc; - } - } - bestmv->row = br; - bestmv->col = bc; - - if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || - (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; - - return besterr; -} - -int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, - MV *bestmv, const MV *ref_mv, - int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, - int iters_per_step, - int *mvjcost, int *mvcost[2], - int *distortion, - unsigned int *sse1, - const uint8_t *second_pred, - int w, int h) { - uint8_t *z = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; - unsigned int sse; - unsigned int whichdir; - int thismse; - unsigned int halfiters = iters_per_step; - unsigned int quarteriters = iters_per_step; - unsigned int eighthiters = iters_per_step; - - DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); - const int y_stride = xd->plane[0].pre[0].stride; - const int offset = bestmv->row * y_stride + bestmv->col; - uint8_t *y = xd->plane[0].pre[0].buf + offset; - - int rr = ref_mv->row; - int rc = ref_mv->col; - int br = bestmv->row * 8; - int bc = bestmv->col * 8; - int hstep = 4; - const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX); - const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX); - const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX); - const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX); - - int tr = br; - int tc = bc; - - // central mv - bestmv->row *= 8; - bestmv->col *= 8; - - // calculate central point error - // TODO(yunqingwang): central pointer error was already calculated in full- - // pixel search, and can be passed in this function. 
- comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); - besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); - *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); + unsigned int *sse1, + const uint8_t *second_pred, + int w, int h) { + SETUP_SUBPEL_SEARCH; + (void) sad_list; // to silence compiler warning // Each subsequent iteration checks at least one point in // common with the last iteration; it could be two if the diagonal was selected @@ -652,6 +436,11 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, tr = br; tc = bc; } + // These lines ensure static analysis doesn't warn that + // tr and tc aren't used after the above point. + (void) tr; + (void) tc; + bestmv->row = br; bestmv->col = bc; @@ -664,49 +453,33 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, #undef MVC #undef PRE -#undef DIST -#undef IFMVCV #undef CHECK_BETTER -#undef SP -#define CHECK_BOUNDS(range) \ - {\ - all_in = 1;\ - all_in &= ((br-range) >= x->mv_row_min);\ - all_in &= ((br+range) <= x->mv_row_max);\ - all_in &= ((bc-range) >= x->mv_col_min);\ - all_in &= ((bc+range) <= x->mv_col_max);\ - } +static INLINE int check_bounds(const MACROBLOCK *x, int row, int col, + int range) { + return ((row - range) >= x->mv_row_min) & + ((row + range) <= x->mv_row_max) & + ((col - range) >= x->mv_col_min) & + ((col + range) <= x->mv_col_max); +} -#define CHECK_POINT \ - {\ - if (this_mv.col < x->mv_col_min) continue;\ - if (this_mv.col > x->mv_col_max) continue;\ - if (this_mv.row < x->mv_row_min) continue;\ - if (this_mv.row > x->mv_row_max) continue;\ - } +static INLINE int is_mv_in(const MACROBLOCK *x, const MV *mv) { + return (mv->col >= x->mv_col_min) && (mv->col <= x->mv_col_max) && + (mv->row >= x->mv_row_min) && (mv->row <= x->mv_row_max); +} #define CHECK_BETTER \ {\ - if (thissad < bestsad)\ - {\ + if (thissad < bestsad) {\ if (use_mvcost) \ - thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, \ - mvjsadcost, mvsadcost, \ - sad_per_bit);\ - if (thissad < bestsad)\ - {\ + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);\ + if (thissad < bestsad) {\ bestsad = thissad;\ best_site = i;\ }\ }\ } -#define get_next_chkpts(list, i, n) \ - list[0] = ((i) == 0 ? (n) - 1 : (i) - 1); \ - list[1] = (i); \ - list[2] = ((i) == (n) - 1 ?
0 : (i) + 1); - #define MAX_PATTERN_SCALES 11 #define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates @@ -715,56 +488,42 @@ int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, // Each scale can have a different number of candidates and shape of // candidates as indicated in the num_candidates and candidates arrays // passed into this function -static int vp9_pattern_search(MACROBLOCK *x, +// +static int vp9_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, - int do_refine, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, - const MV *center_mv, MV *best_mv, + const MV *center_mv, + MV *best_mv, const int num_candidates[MAX_PATTERN_SCALES], const MV candidates[MAX_PATTERN_SCALES] [MAX_PATTERN_CANDIDATES]) { - const MACROBLOCKD* const xd = &x->e_mbd; + const MACROBLOCKD *const xd = &x->e_mbd; static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, }; - int i, j, s, t; - uint8_t *what = x->plane[0].src.buf; - int what_stride = x->plane[0].src.stride; - int in_what_stride = xd->plane[0].pre[0].stride; + int i, s, t; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; int br, bc; - MV this_mv; int bestsad = INT_MAX; int thissad; - uint8_t *base_offset; - uint8_t *this_offset; int k = -1; - int all_in; - int best_site = -1; - int_mv fcenter_mv; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; int best_init_s = search_param_to_steps[search_param]; - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - // adjust ref_mv to make sure it is within MV range clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); br = ref_mv->row; bc = ref_mv->col; // Work out the start point for the search - base_offset = (uint8_t *)(xd->plane[0].pre[0].buf); - this_offset = base_offset + (br * in_what_stride) + bc; - this_mv.row = br; - this_mv.col = bc; - bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff) - + mvsad_err_cost(&this_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); + bestsad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of @@ -773,27 +532,25 @@ static int vp9_pattern_search(MACROBLOCK *x, s = best_init_s; best_init_s = -1; for (t = 0; t <= s; ++t) { - best_site = -1; - CHECK_BOUNDS((1 << t)) - if (all_in) { + int best_site = -1; + if (check_bounds(x, br, bc, 1 << t)) { for (i = 0; i < num_candidates[t]; i++) { - this_mv.row = br + candidates[t][i].row; - this_mv.col = bc + candidates[t][i].col; - this_offset = base_offset + (this_mv.row * in_what_stride) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); + const MV this_mv = {br + candidates[t][i].row, + bc + candidates[t][i].col}; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); CHECK_BETTER } } else { for (i = 0; i < num_candidates[t]; i++) { - this_mv.row = br + candidates[t][i].row; - this_mv.col = bc + candidates[t][i].col; - CHECK_POINT - 
this_offset = base_offset + (this_mv.row * in_what_stride) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); + const MV this_mv = {br + candidates[t][i].row, + bc + candidates[t][i].col}; + if (!is_mv_in(x, &this_mv)) + continue; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); CHECK_BETTER } } @@ -813,31 +570,30 @@ static int vp9_pattern_search(MACROBLOCK *x, // If the center point is still the best, just skip this and move to // the refinement step. if (best_init_s != -1) { + int best_site = -1; s = best_init_s; - best_site = -1; + do { // No need to search all 6 points the 1st time if initial search was used if (!do_init_search || s != best_init_s) { - CHECK_BOUNDS((1 << s)) - if (all_in) { + if (check_bounds(x, br, bc, 1 << s)) { for (i = 0; i < num_candidates[s]; i++) { - this_mv.row = br + candidates[s][i].row; - this_mv.col = bc + candidates[s][i].col; - this_offset = base_offset + (this_mv.row * in_what_stride) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); + const MV this_mv = {br + candidates[s][i].row, + bc + candidates[s][i].col}; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); CHECK_BETTER } } else { for (i = 0; i < num_candidates[s]; i++) { - this_mv.row = br + candidates[s][i].row; - this_mv.col = bc + candidates[s][i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.row * in_what_stride) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); + const MV this_mv = {br + candidates[s][i].row, + bc + candidates[s][i].col}; + if (!is_mv_in(x, &this_mv)) + continue; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); CHECK_BETTER } } @@ -854,28 +610,28 @@ static int vp9_pattern_search(MACROBLOCK *x, do { int next_chkpts_indices[PATTERN_CANDIDATES_REF]; best_site = -1; - CHECK_BOUNDS((1 << s)) + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 
0 : k + 1; - get_next_chkpts(next_chkpts_indices, k, num_candidates[s]); - if (all_in) { + if (check_bounds(x, br, bc, 1 << s)) { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; - this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; - this_offset = base_offset + (this_mv.row * (in_what_stride)) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); + const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col}; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); CHECK_BETTER } } else { for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { - this_mv.row = br + candidates[s][next_chkpts_indices[i]].row; - this_mv.col = bc + candidates[s][next_chkpts_indices[i]].col; - CHECK_POINT - this_offset = base_offset + (this_mv.row * (in_what_stride)) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); + const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col}; + if (!is_mv_in(x, &this_mv)) + continue; + thissad = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); CHECK_BETTER } } @@ -889,70 +645,80 @@ static int vp9_pattern_search(MACROBLOCK *x, } while (s--); } - // Check 4 1-away neighbors if do_refine is true. - // For most well-designed schemes do_refine will not be necessary. - if (do_refine) { - static const MV neighbors[4] = { - {0, -1}, { -1, 0}, {1, 0}, {0, 1}, - }; - for (j = 0; j < 16; j++) { - best_site = -1; - CHECK_BOUNDS(1) - if (all_in) { - for (i = 0; i < 4; i++) { - this_mv.row = br + neighbors[i].row; - this_mv.col = bc + neighbors[i].col; - this_offset = base_offset + (this_mv.row * (in_what_stride)) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); - CHECK_BETTER - } - } else { - for (i = 0; i < 4; i++) { - this_mv.row = br + neighbors[i].row; - this_mv.col = bc + neighbors[i].col; - CHECK_POINT - this_offset = base_offset + (this_mv.row * (in_what_stride)) + - this_mv.col; - thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, - bestsad); - CHECK_BETTER - } - } - - if (best_site == -1) { - break; - } else { - br += neighbors[best_site].row; - bc += neighbors[best_site].col; + // Returns the one-away integer pel sad values around the best as follows: + // sad_list[0]: sad at the best integer pel + // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel + // sad_list[2]: sad at delta {-1, 0} (top) from the best integer pel + // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel + // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel + if (sad_list) { + static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}}; + sad_list[0] = bestsad; + if (check_bounds(x, br, bc, 1)) { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + sad_list[i + 1] = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); + } + } else { + for (i = 0; i < 4; i++) { + const MV this_mv = {br + neighbors[i].row, + bc + neighbors[i].col}; + if (!is_mv_in(x, &this_mv)) + sad_list[i + 1] = INT_MAX; + else + sad_list[i + 1] = vfp->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &this_mv), + in_what->stride); } } } - best_mv->row = br; best_mv->col = 
bc; - - this_offset = base_offset + (best_mv->row * in_what_stride) + - best_mv->col; - this_mv.row = best_mv->row * 8; - this_mv.col = best_mv->col * 8; - if (bestsad == INT_MAX) - return INT_MAX; - - return vfp->vf(what, what_stride, this_offset, in_what_stride, - (unsigned int *)&bestsad) + - use_mvcost ? mv_err_cost(&this_mv, center_mv, - x->nmvjointcost, x->mvcost, x->errorperbit) - : 0; + return bestsad; } +int vp9_get_mvpred_var(const MACROBLOCK *x, + const MV *best_mv, const MV *center_mv, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = {best_mv->row * 8, best_mv->col * 8}; + unsigned int unused; -int vp9_hex_search(MACROBLOCK *x, + return vfp->vf(what->buf, what->stride, + get_buf_from_mv(in_what, best_mv), in_what->stride, &unused) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, + x->mvcost, x->errorperbit) : 0); +} + +int vp9_get_mvpred_av_var(const MACROBLOCK *x, + const MV *best_mv, const MV *center_mv, + const uint8_t *second_pred, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = {best_mv->row * 8, best_mv->col * 8}; + unsigned int unused; + + return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0, + what->buf, what->stride, &unused, second_pred) + + (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, + x->mvcost, x->errorperbit) : 0); +} + +int vp9_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, MV *best_mv) { @@ -976,18 +742,18 @@ int vp9_hex_search(MACROBLOCK *x, {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024}, { -1024, 0}}, }; - return - vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, - center_mv, best_mv, - hex_num_candidates, hex_candidates); + return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, + do_init_search, sad_list, vfp, use_mvcost, + center_mv, best_mv, + hex_num_candidates, hex_candidates); } -int vp9_bigdia_search(MACROBLOCK *x, +int vp9_bigdia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, @@ -1019,16 +785,17 @@ int vp9_bigdia_search(MACROBLOCK *x, {-512, 512}, {-1024, 0}}, }; return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, + do_init_search, sad_list, vfp, use_mvcost, center_mv, best_mv, bigdia_num_candidates, bigdia_candidates); } -int vp9_square_search(MACROBLOCK *x, +int vp9_square_search(const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit, int do_init_search, + int *sad_list, const vp9_variance_fn_ptr_t *vfp, int use_mvcost, const MV *center_mv, @@ -1060,173 +827,130 @@ int vp9_square_search(MACROBLOCK *x, {0, 1024}, {-1024, 1024}, {-1024, 0}}, }; return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit, - do_init_search, 0, vfp, use_mvcost, + do_init_search, sad_list, vfp, use_mvcost, center_mv, best_mv, square_num_candidates, square_candidates); -}; +} + +int vp9_fast_hex_search(const MACROBLOCK *x, + MV *ref_mv, + int search_param, + 
int sad_per_bit, + int do_init_search, // must be zero for fast_hex + int *sad_list, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + const MV *center_mv, + MV *best_mv) { + return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param), + sad_per_bit, do_init_search, sad_list, vfp, use_mvcost, + center_mv, best_mv); +} + +int vp9_fast_dia_search(const MACROBLOCK *x, + MV *ref_mv, + int search_param, + int sad_per_bit, + int do_init_search, + int *sad_list, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost, + const MV *center_mv, + MV *best_mv) { + return vp9_bigdia_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param), + sad_per_bit, do_init_search, sad_list, vfp, + use_mvcost, center_mv, best_mv); +} -#undef CHECK_BOUNDS -#undef CHECK_POINT #undef CHECK_BETTER -int vp9_diamond_search_sad_c(MACROBLOCK *x, - int_mv *ref_mv, int_mv *best_mv, - int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], int_mv *center_mv) { - int i, j, step; +int vp9_full_range_search_c(const MACROBLOCK *x, + const search_site_config *cfg, + MV *ref_mv, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int range = 64; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; - const MACROBLOCKD* const xd = &x->e_mbd; - uint8_t *what = x->plane[0].src.buf; - int what_stride = x->plane[0].src.stride; - uint8_t *in_what; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *best_address; + // The cfg and search_param parameters are not used in this search variant + (void)cfg; + (void)search_param; - int tot_steps; - int_mv this_mv; + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + *best_mv = *ref_mv; + *num00 = 11; + best_sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + start_row = MAX(-range, x->mv_row_min - ref_mv->row); + start_col = MAX(-range, x->mv_col_min - ref_mv->col); + end_row = MIN(range, x->mv_row_max - ref_mv->row); + end_col = MIN(range, x->mv_col_max - ref_mv->col); - int bestsad = INT_MAX; - int best_site = 0; - int last_site = 0; - - int ref_row, ref_col; - int this_row_offset, this_col_offset; - search_site *ss; - - uint8_t *check_here; - int thissad; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - clamp_mv(&ref_mv->as_mv, - x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - ref_row = ref_mv->as_mv.row; - ref_col = ref_mv->as_mv.col; - *num00 = 0; - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - // Work out the start point for the search - in_what = (uint8_t *)(xd->plane[0].pre[0].buf + - (ref_row * (xd->plane[0].pre[0].stride)) + ref_col); - best_address = in_what; - - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - // search_param determines the length of 
the initial step and hence the number - // of iterations - // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = - // (MAX_FIRST_STEP/4) pel... etc. - ss = &x->ss[search_param * x->searches_per_step]; - tot_steps = (x->ss_count / x->searches_per_step) - search_param; - - i = 1; - - for (step = 0; step < tot_steps; step++) { - for (j = 0; j < x->searches_per_step; j++) { - // Trap illegal vectors - this_row_offset = best_mv->as_mv.row + ss[i].mv.row; - this_col_offset = best_mv->as_mv.col + ss[i].mv.col; - - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_site = i; - } + for (r = start_row; r <= end_row; ++r) { + for (c = start_col; c <= end_col; c += 4) { + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; + addrs[i] = get_buf_from_mv(in_what, &mv); } - } - i++; - } + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); - if (best_site != last_site) { - best_mv->as_mv.row += ss[best_site].mv.row; - best_mv->as_mv.col += ss[best_site].mv.col; - best_address += ss[best_site].offset; - last_site = best_site; -#if defined(NEW_DIAMOND_SEARCH) - while (1) { - this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; - this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = ss[best_site].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row += ss[best_site].mv.row; - best_mv->as_mv.col += ss[best_site].mv.col; - best_address += ss[best_site].offset; - continue; + for (i = 0; i < 4; ++i) { + if (sads[i] < best_sad) { + const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; + const unsigned int sad = sads[i] + + mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } } } - break; - }; -#endif - } else if (best_address == in_what) { - (*num00)++; + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = {ref_mv->row + r, ref_mv->col + c + i}; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } } } - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; - - if (bestsad == INT_MAX) - return INT_MAX; - - return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, 
mvcost, x->errorperbit); + return best_sad; } -int vp9_diamond_search_sadx4(MACROBLOCK *x, - int_mv *ref_mv, int_mv *best_mv, int search_param, +int vp9_diamond_search_sad_c(const MACROBLOCK *x, + const search_site_config *cfg, + MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], int_mv *center_mv) { + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { int i, j, step; - const MACROBLOCKD* const xd = &x->e_mbd; + const MACROBLOCKD *const xd = &x->e_mbd; uint8_t *what = x->plane[0].src.buf; - int what_stride = x->plane[0].src.stride; - uint8_t *in_what; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *best_address; - - int tot_steps; - int_mv this_mv; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address; unsigned int bestsad = INT_MAX; int best_site = 0; @@ -1234,45 +958,30 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, int ref_row; int ref_col; - int this_row_offset; - int this_col_offset; - search_site *ss; - - uint8_t *check_here; - unsigned int thissad; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - clamp_mv(&ref_mv->as_mv, - x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - ref_row = ref_mv->as_mv.row; - ref_col = ref_mv->as_mv.col; - *num00 = 0; - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - // Work out the start point for the search - in_what = (uint8_t *)(xd->plane[0].pre[0].buf + - (ref_row * (xd->plane[0].pre[0].stride)) + ref_col); - best_address = in_what; - - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) - + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); // search_param determines the length of the initial step and hence the number // of iterations. // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... - ss = &x->ss[search_param * x->searches_per_step]; - tot_steps = (x->ss_count / x->searches_per_step) - search_param; + const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param; + + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + ref_row = ref_mv->row; + ref_col = ref_mv->col; + *num00 = 0; + best_mv->row = ref_row; + best_mv->col = ref_col; + + // Work out the start point for the search + in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; + best_address = in_what; + + // Check the starting position + bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); i = 1; @@ -1281,10 +990,10 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, // all_in is true if every one of the points we are checking is within // the bounds of the image. 
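// When all_in holds below, four candidate offsets are handed to
// fn_ptr->sdx4df in one call instead of four separate sdf calls. A scalar
// sketch of that interface (hypothetical helper: the real table entries are
// SIMD kernels with the block size baked in, so width/height are
// assumptions here, and abs() needs <stdlib.h>):
//
//   static void sad_x4_sketch(const uint8_t *src, int src_stride,
//                             const uint8_t *const refs[4], int ref_stride,
//                             int width, int height, unsigned int sads[4]) {
//     int i, r, c;
//     for (i = 0; i < 4; ++i) {
//       unsigned int sad = 0;
//       for (r = 0; r < height; ++r)
//         for (c = 0; c < width; ++c)
//           sad += abs(src[r * src_stride + c] -
//                      refs[i][r * ref_stride + c]);
//       sads[i] = sad;
//     }
//   }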
- all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min); - all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max); - all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min); - all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max); + all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_row_min); + all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_row_max); + all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_col_min); + all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_col_max); // If all the pixels are within the bounds we don't check whether the // search point is valid in this loop, otherwise we check each point @@ -1292,7 +1001,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (all_in) { unsigned int sad_array[4]; - for (j = 0; j < x->searches_per_step; j += 4) { + for (j = 0; j < cfg->searches_per_step; j += 4) { unsigned char const *block_offset[4]; for (t = 0; t < 4; t++) @@ -1303,11 +1012,10 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { - this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row; - this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col; - sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - + const MV this_mv = {best_mv->row + ss[i].mv.row, + best_mv->col + ss[i].mv.col}; + sad_array[t] += mvsad_err_cost(x, &this_mv, &fcenter_mv, + sad_per_bit); if (sad_array[t] < bestsad) { bestsad = sad_array[t]; best_site = i; @@ -1316,25 +1024,18 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, } } } else { - for (j = 0; j < x->searches_per_step; j++) { + for (j = 0; j < cfg->searches_per_step; j++) { // Trap illegal vectors - this_row_offset = best_mv->as_mv.row + ss[i].mv.row; - this_col_offset = best_mv->as_mv.col + ss[i].mv.col; + const MV this_mv = {best_mv->row + ss[i].mv.row, + best_mv->col + ss[i].mv.col}; - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); + if (is_mv_in(x, &this_mv)) { + const uint8_t *const check_here = ss[i].offset + best_address; + unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, + in_what_stride); if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; best_site = i; @@ -1345,30 +1046,24 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, } } if (best_site != last_site) { - best_mv->as_mv.row += ss[best_site].mv.row; - best_mv->as_mv.col += ss[best_site].mv.col; + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; #if defined(NEW_DIAMOND_SEARCH) while (1) { - this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; - this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = ss[best_site].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); + const MV this_mv = 
{best_mv->row + ss[best_site].mv.row, + best_mv->col + ss[best_site].mv.col}; + if (is_mv_in(x, &this_mv)) { + const uint8_t *const check_here = ss[best_site].offset + best_address; + unsigned int thissad = fn_ptr->sdf(what, what_stride, check_here, + in_what_stride); if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); + thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); if (thissad < bestsad) { bestsad = thissad; - best_mv->as_mv.row += ss[best_site].mv.row; - best_mv->as_mv.col += ss[best_site].mv.col; + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; best_address += ss[best_site].offset; continue; } @@ -1381,611 +1076,316 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, (*num00)++; } } - - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; - - if (bestsad == INT_MAX) - return INT_MAX; - - return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); + return bestsad; } /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ -int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, - int_mv *mvp_full, int step_param, - int sadpb, int further_steps, - int do_refine, vp9_variance_fn_ptr_t *fn_ptr, - int_mv *ref_mv, int_mv *dst_mv) { - int_mv temp_mv; - int thissme, n, num00; - int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, - step_param, sadpb, &num00, - fn_ptr, x->nmvjointcost, - x->mvcost, ref_mv); - dst_mv->as_int = temp_mv.as_int; +int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, + int sadpb, int further_steps, int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv) { + MV temp_mv; + int thissme, n, num00 = 0; + int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param, sadpb, &n, + fn_ptr, ref_mv); + if (bestsme < INT_MAX) + bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); + *dst_mv = temp_mv; - n = num00; - num00 = 0; - - /* If there won't be more n-step search, check to see if refining search is - * needed. */ + // If there won't be more n-step search, check to see if refining search is + // needed. if (n > further_steps) do_refine = 0; while (n < further_steps) { - n++; + ++n; if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv, + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, step_param + n, sadpb, &num00, - fn_ptr, x->nmvjointcost, x->mvcost, - ref_mv); + fn_ptr, ref_mv); + if (thissme < INT_MAX) + thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); - /* check to see if refining search is needed. */ - if (num00 > (further_steps - n)) + // check to see if refining search is needed. 
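// (num00 is the count of consecutive step sizes for which the diamond
// search already found the centre to be best; each such size is consumed
// here by decrementing num00 instead of searching again. For example, if
// the call at step_param + n reports num00 == 2, the next two iterations
// of this loop fall through without another diamond_search_sad call.)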
+ if (num00 > further_steps - n) do_refine = 0; if (thissme < bestsme) { bestsme = thissme; - dst_mv->as_int = temp_mv.as_int; + *dst_mv = temp_mv; } } } - /* final 1-away diamond refining search */ - if (do_refine == 1) { - int search_range = 8; - int_mv best_mv; - best_mv.as_int = dst_mv->as_int; + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV best_mv = *dst_mv; thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range, - fn_ptr, x->nmvjointcost, x->mvcost, - ref_mv); - + fn_ptr, ref_mv); + if (thissme < INT_MAX) + thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { bestsme = thissme; - dst_mv->as_int = best_mv.as_int; + *dst_mv = best_mv; } } return bestsme; } -int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, +int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], - int_mv *center_mv, int n) { - const MACROBLOCKD* const xd = &x->e_mbd; - uint8_t *what = x->plane[0].src.buf; - int what_stride = x->plane[0].src.stride; - uint8_t *in_what; - int in_what_stride = xd->plane[0].pre[0].stride; - int mv_stride = xd->plane[0].pre[0].stride; - uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; - int_mv this_mv; - int bestsad = INT_MAX; + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv) { int r, c; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + int best_sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; - uint8_t *check_here; - int thissad; - - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; - - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - // Work out the mid point for the search - in_what = xd->plane[0].pre[0].buf; - bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - // Baseline value at the centre - bestsad = fn_ptr->sdf(what, what_stride, bestaddress, - in_what_stride, 0x7fffffff) - + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - col_min = MAX(col_min, x->mv_col_min); - col_max = MIN(col_max, x->mv_col_max); - row_min = MAX(row_min, x->mv_row_min); - row_max = MIN(row_max, x->mv_row_max); - - for (r = row_min; r < row_max; r++) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - - for (c = col_min; c < col_max; c++) { - thissad = fn_ptr->sdf(what, what_stride, check_here, 
in_what_stride, - bestsad); - - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; + for (r = row_min; r < row_max; ++r) { + for (c = col_min; c < col_max; ++c) { + const MV mv = {r, c}; + const int sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride) + + mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } - - check_here++; } } - - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); - else - return INT_MAX; + return best_sad; } -int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, +int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, - int *mvcost[2], int_mv *center_mv, int n) { - const MACROBLOCKD* const xd = &x->e_mbd; - uint8_t *what = x->plane[0].src.buf; - int what_stride = x->plane[0].src.stride; - uint8_t *in_what; - int in_what_stride = xd->plane[0].pre[0].stride; - int mv_stride = xd->plane[0].pre[0].stride; - uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; - int_mv this_mv; - unsigned int bestsad = INT_MAX; - int r, c; + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv) { + int r; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; - uint8_t *check_here; - unsigned int thissad; + for (r = row_min; r < row_max; ++r) { + int c = col_min; + const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; + if (fn_ptr->sdx3f != NULL) { + while ((c + 2) < col_max) { + int i; + unsigned int sads[3]; - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; + fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, + sads); - unsigned int sad_array[3]; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - // Work out the mid point for the search - in_what = xd->plane[0].pre[0].buf; - bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - // Baseline value at the centre - bestsad = fn_ptr->sdf(what, what_stride, - bestaddress, in_what_stride, 
0x7fffffff) - + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - col_min = MAX(col_min, x->mv_col_min); - col_max = MIN(col_max, x->mv_col_max); - row_min = MAX(row_min, x->mv_row_min); - row_max = MIN(row_max, x->mv_row_max); - - for (r = row_min; r < row_max; r++) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 2) < col_max) { - int i; - - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); - - for (i = 0; i < 3; i++) { - thissad = sad_array[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; + for (i = 0; i < 3; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } + ++check_here; + ++c; } - - check_here++; - c++; } } while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + check_here, in_what->stride); + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } } - - check_here++; - c++; + ++check_here; + ++c; } } - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); - else - return INT_MAX; + return best_sad; } -int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, +int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - int_mv *center_mv, int n) { - const MACROBLOCKD* const xd = &x->e_mbd; - uint8_t *what = x->plane[0].src.buf; - int what_stride = x->plane[0].src.stride; - uint8_t *in_what; - int in_what_stride = xd->plane[0].pre[0].stride; - int mv_stride = xd->plane[0].pre[0].stride; - uint8_t *bestaddress; - int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0]; - int_mv this_mv; - unsigned int bestsad = INT_MAX; - int r, c; + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv) { + int r; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const int row_min = MAX(ref_mv->row - distance, x->mv_row_min); + const int row_max = MIN(ref_mv->row + distance, x->mv_row_max); + const int col_min = MAX(ref_mv->col - distance, x->mv_col_min); + const int col_max = MIN(ref_mv->col + distance, x->mv_col_max); + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + unsigned int best_sad = 
fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); + *best_mv = *ref_mv; - uint8_t *check_here; - unsigned int thissad; + for (r = row_min; r < row_max; ++r) { + int c = col_min; + const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - int ref_row = ref_mv->as_mv.row; - int ref_col = ref_mv->as_mv.col; + if (fn_ptr->sdx8f != NULL) { + while ((c + 7) < col_max) { + int i; + unsigned int sads[8]; - int row_min = ref_row - distance; - int row_max = ref_row + distance; - int col_min = ref_col - distance; - int col_max = ref_col + distance; + fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, + sads); - DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8); - unsigned int sad_array[3]; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - // Work out the mid point for the search - in_what = xd->plane[0].pre[0].buf; - bestaddress = in_what + (ref_row * xd->plane[0].pre[0].stride) + ref_col; - - best_mv->as_mv.row = ref_row; - best_mv->as_mv.col = ref_col; - - // Baseline value at the centre - bestsad = fn_ptr->sdf(what, what_stride, - bestaddress, in_what_stride, 0x7fffffff) - + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - // Apply further limits to prevent us looking using vectors that stretch - // beyond the UMV border - col_min = MAX(col_min, x->mv_col_min); - col_max = MIN(col_max, x->mv_col_max); - row_min = MAX(row_min, x->mv_row_min); - row_max = MIN(row_max, x->mv_row_max); - - for (r = row_min; r < row_max; r++) { - this_mv.as_mv.row = r; - check_here = r * mv_stride + in_what + col_min; - c = col_min; - - while ((c + 7) < col_max) { - int i; - - fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8); - - for (i = 0; i < 8; i++) { - thissad = (unsigned int)sad_array8[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; + for (i = 0; i < 8; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } + ++check_here; + ++c; } - - check_here++; - c++; } } - while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) { - int i; + if (fn_ptr->sdx3f != NULL) { + while ((c + 2) < col_max) { + int i; + unsigned int sads[3]; - fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array); + fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, + sads); - for (i = 0; i < 3; i++) { - thissad = sad_array[i]; - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; + for (i = 0; i < 3; ++i) { + unsigned int sad = sads[i]; + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } } + ++check_here; + ++c; } - - 
check_here++; - c++; } } while (c < col_max) { - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - if (thissad < bestsad) { - this_mv.as_mv.col = c; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, sad_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_mv->as_mv.row = r; - best_mv->as_mv.col = c; - bestaddress = check_here; + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + check_here, in_what->stride); + if (sad < best_sad) { + const MV mv = {r, c}; + sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; } } - - check_here++; - c++; + ++check_here; + ++c; } } - this_mv.as_mv.row = best_mv->as_mv.row * 8; - this_mv.as_mv.col = best_mv->as_mv.col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); - else - return INT_MAX; + return best_sad; } -int vp9_refining_search_sad_c(MACROBLOCK *x, - int_mv *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], int_mv *center_mv) { - const MACROBLOCKD* const xd = &x->e_mbd; - MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; + +int vp9_refining_search_sad_c(const MACROBLOCK *x, + MV *ref_mv, int error_per_bit, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); + unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride, best_address, + in_what->stride) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; - int this_row_offset, this_col_offset; - - int what_stride = x->plane[0].src.stride; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *what = x->plane[0].src.buf; - uint8_t *best_address = xd->plane[0].pre[0].buf + - (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) + - ref_mv->as_mv.col; - uint8_t *check_here; - unsigned int thissad; - int_mv this_mv; - unsigned int bestsad = INT_MAX; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - bestsad = fn_ptr->sdf(what, what_stride, best_address, - in_what_stride, 0x7fffffff) + - mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; - - for (j = 0; j < 4; j++) { - this_row_offset = ref_mv->as_mv.row + neighbors[j].row; - this_col_offset = ref_mv->as_mv.col + neighbors[j].col; - - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + - best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, 
&fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; - best_site = j; - } - } - } - } - - if (best_site == -1) { - break; - } else { - ref_mv->as_mv.row += neighbors[best_site].row; - ref_mv->as_mv.col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + - neighbors[best_site].col; - } - } - - this_mv.as_mv.row = ref_mv->as_mv.row * 8; - this_mv.as_mv.col = ref_mv->as_mv.col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); - else - return INT_MAX; -} - -int vp9_refining_search_sadx4(MACROBLOCK *x, - int_mv *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], int_mv *center_mv) { - const MACROBLOCKD* const xd = &x->e_mbd; - MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; - int i, j; - int this_row_offset, this_col_offset; - - int what_stride = x->plane[0].src.stride; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *what = x->plane[0].src.buf; - uint8_t *best_address = xd->plane[0].pre[0].buf + - (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) + - ref_mv->as_mv.col; - uint8_t *check_here; - unsigned int thissad; - int_mv this_mv; - unsigned int bestsad = INT_MAX; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - bestsad = fn_ptr->sdf(what, what_stride, best_address, - in_what_stride, 0x7fffffff) + - mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); - - for (i = 0; i < search_range; i++) { - int best_site = -1; - int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) & - ((ref_mv->as_mv.row + 1) < x->mv_row_max) & - ((ref_mv->as_mv.col - 1) > x->mv_col_min) & - ((ref_mv->as_mv.col + 1) < x->mv_col_max); + const int all_in = ((ref_mv->row - 1) > x->mv_row_min) & + ((ref_mv->row + 1) < x->mv_row_max) & + ((ref_mv->col - 1) > x->mv_col_min) & + ((ref_mv->col + 1) < x->mv_col_max); if (all_in) { - unsigned int sad_array[4]; - unsigned char const *block_offset[4]; - block_offset[0] = best_address - in_what_stride; - block_offset[1] = best_address - 1; - block_offset[2] = best_address + 1; - block_offset[3] = best_address + in_what_stride; + unsigned int sads[4]; + const uint8_t *const positions[4] = { + best_address - in_what->stride, + best_address - 1, + best_address + 1, + best_address + in_what->stride + }; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); - for (j = 0; j < 4; j++) { - if (sad_array[j] < bestsad) { - this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row; - this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col; - sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); - - if (sad_array[j] < bestsad) { - bestsad = sad_array[j]; + for (j = 0; j < 4; ++j) { + if (sads[j] < best_sad) { + const MV mv = {ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col}; + sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sads[j] < best_sad) { + best_sad = sads[j]; best_site = j; } } } } else { - for (j = 0; j < 4; j++) { - this_row_offset = 
ref_mv->as_mv.row + neighbors[j].row; - this_col_offset = ref_mv->as_mv.col + neighbors[j].col; + for (j = 0; j < 4; ++j) { + const MV mv = {ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col}; - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + - best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, - bestsad); - - if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); - - if (thissad < bestsad) { - bestsad = thissad; + if (is_mv_in(x, &mv)) { + unsigned int sad = fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), + in_what->stride); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; best_site = j; } } @@ -1996,88 +1396,48 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, if (best_site == -1) { break; } else { - ref_mv->as_mv.row += neighbors[best_site].row; - ref_mv->as_mv.col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + - neighbors[best_site].col; + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, ref_mv); } } - this_mv.as_mv.row = ref_mv->as_mv.row * 8; - this_mv.as_mv.col = ref_mv->as_mv.col * 8; - - if (bestsad < INT_MAX) - return fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); - else - return INT_MAX; + return best_sad; } -/* This function is called when we do joint motion search in comp_inter_inter - * mode. - */ -int vp9_refining_search_8p_c(MACROBLOCK *x, - int_mv *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], int_mv *center_mv, - const uint8_t *second_pred, int w, int h) { - const MACROBLOCKD* const xd = &x->e_mbd; - MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, - {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; +// This function is called when we do joint motion search in comp_inter_inter +// mode. 
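A pattern worth noting across these rewritten loops: every candidate is tested twice against the current best. The raw SAD is compared first, and only when it already wins is the motion-vector rate cost added and the comparison repeated, so mvsad_err_cost() is never paid for clearly losing candidates. Likewise, get_buf_from_mv(in_what, mv) presumably resolves to in_what->buf + mv->row * in_what->stride + mv->col, replacing the hand-written offset arithmetic the old code repeated everywhere. A minimal sketch of the gating idiom, reusing the MV/MACROBLOCK types and mvsad_err_cost() from this file (try_candidate itself is a hypothetical helper):

static void try_candidate(const MV *mv, unsigned int raw_sad,
                          unsigned int *best_sad, MV *best_mv,
                          const MACROBLOCK *x, const MV *fcenter_mv,
                          int sad_per_bit) {
  if (raw_sad < *best_sad) {              /* cheap test: SAD alone */
    const unsigned int sad = raw_sad +
        mvsad_err_cost(x, mv, fcenter_mv, sad_per_bit);  /* add mv rate */
    if (sad < *best_sad) {                /* re-test with full cost */
      *best_sad = sad;
      *best_mv = *mv;
    }
  }
}

The sdx4df variants apply the same idiom after fetching four neighboring SADs in a single call, and the joint-search function below follows suit with sdaf(), which takes the SAD against the average of the reference block and second_pred.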
+int vp9_refining_search_8p_c(const MACROBLOCK *x, + MV *ref_mv, int error_per_bit, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, + const uint8_t *second_pred) { + const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, + {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; + unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; - int this_row_offset, this_col_offset; - int what_stride = x->plane[0].src.stride; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *what = x->plane[0].src.buf; - uint8_t *best_address = xd->plane[0].pre[0].buf + - (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) + - ref_mv->as_mv.col; - uint8_t *check_here; - unsigned int thissad; - int_mv this_mv; - unsigned int bestsad = INT_MAX; - int_mv fcenter_mv; - - int *mvjsadcost = x->nmvjointsadcost; - int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - - fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; - fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - - /* Get compound pred by averaging two pred blocks. */ - bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, - second_pred, 0x7fffffff) + - mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); - - for (i = 0; i < search_range; i++) { + for (i = 0; i < search_range; ++i) { int best_site = -1; - for (j = 0; j < 8; j++) { - this_row_offset = ref_mv->as_mv.row + neighbors[j].row; - this_col_offset = ref_mv->as_mv.col + neighbors[j].col; + for (j = 0; j < 8; ++j) { + const MV mv = {ref_mv->row + neighbors[j].row, + ref_mv->col + neighbors[j].col}; - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + - best_address; - - /* Get compound block and use it to calculate SAD. 
*/ - thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride, - second_pred, bestsad); - - if (thissad < bestsad) { - this_mv.as_mv.row = this_row_offset; - this_mv.as_mv.col = this_col_offset; - thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv, - mvjsadcost, mvsadcost, error_per_bit); - if (thissad < bestsad) { - bestsad = thissad; + if (is_mv_in(x, &mv)) { + unsigned int sad = fn_ptr->sdaf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride, second_pred); + if (sad < best_sad) { + sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); + if (sad < best_sad) { + best_sad = sad; best_site = j; } } @@ -2087,24 +1447,63 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (best_site == -1) { break; } else { - ref_mv->as_mv.row += neighbors[best_site].row; - ref_mv->as_mv.col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + - neighbors[best_site].col; + ref_mv->row += neighbors[best_site].row; + ref_mv->col += neighbors[best_site].col; } } - - this_mv.as_mv.row = ref_mv->as_mv.row * 8; - this_mv.as_mv.col = ref_mv->as_mv.col * 8; - - if (bestsad < INT_MAX) { - // FIXME(rbultje, yunqing): add full-pixel averaging variance functions - // so we don't have to use the subpixel with xoff=0,yoff=0 here. - return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride, - (unsigned int *)(&thissad), second_pred) + - mv_err_cost(&this_mv.as_mv, ¢er_mv->as_mv, - mvjcost, mvcost, x->errorperbit); - } else { - return INT_MAX; - } + return best_sad; +} + +int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + int *sad_list, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd) { + const SPEED_FEATURES *const sf = &cpi->sf; + const SEARCH_METHODS method = sf->mv.search_method; + vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + int var = 0; + if (sad_list) { + sad_list[0] = INT_MAX; + sad_list[1] = INT_MAX; + sad_list[2] = INT_MAX; + sad_list[3] = INT_MAX; + sad_list[4] = INT_MAX; + } + + switch (method) { + case FAST_DIAMOND: + var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + sad_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case FAST_HEX: + var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + sad_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case HEX: + var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, + sad_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case SQUARE: + var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, + sad_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case BIGDIA: + var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, + sad_list, fn_ptr, 1, ref_mv, tmp_mv); + break; + case NSTEP: + var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, + 1, fn_ptr, ref_mv, tmp_mv); + break; + default: + assert(!"Invalid search method."); + } + + if (method != NSTEP && rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); + + return var; } diff --git a/media/libvpx/vp9/encoder/vp9_mcomp.h b/media/libvpx/vp9/encoder/vp9_mcomp.h index bcab679c7e61..9b4734a6f0bc 100644 --- a/media/libvpx/vp9/encoder/vp9_mcomp.h +++ b/media/libvpx/vp9/encoder/vp9_mcomp.h @@ -15,117 +15,136 @@ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_variance.h" +#ifdef __cplusplus +extern "C" { +#endif + // The maximum number of steps in a step search given the largest // allowed 
initial step #define MAX_MVSEARCH_STEPS 11 -// Max full pel mv specified in 1 pel units -#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) // Maximum size of the first step in full pel units #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Allowed motion vector pixel distance outside image border // for Block_16x16 #define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND) +// motion search site +typedef struct search_site { + MV mv; + int offset; +} search_site; -void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv); +typedef struct search_site_config { + search_site ss[8 * MAX_MVSEARCH_STEPS + 1]; + int ss_count; + int searches_per_step; +} search_site_config; + +void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); +void vp9_init3smotion_compensation(search_site_config *cfg, int stride); + +void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv); int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight); -void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); -void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); + +// Utility to compute variance + MV rate cost for a given MV +int vp9_get_mvpred_var(const MACROBLOCK *x, + const MV *best_mv, const MV *center_mv, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost); +int vp9_get_mvpred_av_var(const MACROBLOCK *x, + const MV *best_mv, const MV *center_mv, + const uint8_t *second_pred, + const vp9_variance_fn_ptr_t *vfp, + int use_mvcost); struct VP9_COMP; -int vp9_init_search_range(struct VP9_COMP *cpi, int size); +struct SPEED_FEATURES; + +int vp9_init_search_range(int size); // Runs sequence of diamond searches in smaller steps for RD -int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, - int_mv *mvp_full, int step_param, +int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, - vp9_variance_fn_ptr_t *fn_ptr, - int_mv *ref_mv, int_mv *dst_mv); + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *ref_mv, MV *dst_mv); -int vp9_hex_search(MACROBLOCK *x, - MV *ref_mv, - int search_param, - int error_per_bit, - int do_init_search, - const vp9_variance_fn_ptr_t *vf, - int use_mvcost, - const MV *center_mv, - MV *best_mv); -int vp9_bigdia_search(MACROBLOCK *x, - MV *ref_mv, - int search_param, - int error_per_bit, - int do_init_search, - const vp9_variance_fn_ptr_t *vf, - int use_mvcost, - const MV *center_mv, - MV *best_mv); -int vp9_square_search(MACROBLOCK *x, - MV *ref_mv, - int search_param, - int error_per_bit, - int do_init_search, - const vp9_variance_fn_ptr_t *vf, - int use_mvcost, - const MV *center_mv, - MV *best_mv); +typedef int (integer_mv_pattern_search_fn) ( + const MACROBLOCK *x, + MV *ref_mv, + int search_param, + int error_per_bit, + int do_init_search, + int *sad_list, + const vp9_variance_fn_ptr_t *vf, + int use_mvcost, + const MV *center_mv, + MV *best_mv); + +integer_mv_pattern_search_fn vp9_hex_search; +integer_mv_pattern_search_fn vp9_bigdia_search; +integer_mv_pattern_search_fn vp9_square_search; +integer_mv_pattern_search_fn vp9_fast_hex_search; +integer_mv_pattern_search_fn vp9_fast_dia_search; typedef int (fractional_mv_step_fp) ( - MACROBLOCK *x, - MV *bestmv, const MV *ref_mv, - int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - 
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only - int iters_per_step, - int *mvjcost, - int *mvcost[2], - int *distortion, - unsigned int *sse); -extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative; -extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; - -typedef int (fractional_mv_step_comp_fp) ( - MACROBLOCK *x, + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, + int *sad_list, int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w, int h); -extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative; -extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree; -typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x, - int_mv *ref_mv, int sad_per_bit, - int distance, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - int_mv *center_mv, int n); +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; +extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; -typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x, - int_mv *ref_mv, int sad_per_bit, +typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, + const MV *ref_mv, int sad_per_bit, + int distance, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, MV *best_mv); + +typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, + MV *ref_mv, int sad_per_bit, int distance, - vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - int_mv *center_mv); + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv); -typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, - int_mv *ref_mv, int_mv *best_mv, +typedef int (*vp9_diamond_search_fn_t)(const MACROBLOCK *x, + const search_site_config *cfg, + MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, - vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - int_mv *center_mv); + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv); + +int vp9_refining_search_8p_c(const MACROBLOCK *x, + MV *ref_mv, int error_per_bit, + int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, const uint8_t *second_pred); + +struct VP9_COMP; + +int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + int *sad_list, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd); + +#ifdef __cplusplus +} // extern "C" +#endif -int vp9_refining_search_8p_c(MACROBLOCK *x, - int_mv *ref_mv, int error_per_bit, - int search_range, vp9_variance_fn_ptr_t *fn_ptr, - int *mvjcost, int *mvcost[2], - int_mv *center_mv, const uint8_t *second_pred, - int w, int h); #endif // VP9_ENCODER_VP9_MCOMP_H_ diff --git a/media/libvpx/vp9/encoder/vp9_modecosts.c b/media/libvpx/vp9/encoder/vp9_modecosts.c deleted file mode 100644 index 7eb659232447..000000000000 --- a/media/libvpx/vp9/encoder/vp9_modecosts.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vp9/common/vp9_blockd.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_treewriter.h" -#include "vp9/common/vp9_entropymode.h" - - -void vp9_init_mode_costs(VP9_COMP *c) { - VP9_COMMON *const cm = &c->common; - const vp9_tree_index *KT = vp9_intra_mode_tree; - int i, j; - - for (i = 0; i < INTRA_MODES; i++) { - for (j = 0; j < INTRA_MODES; j++) { - vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], - KT); - } - } - - // TODO(rbultje) separate tables for superblock costing? - vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1], - vp9_intra_mode_tree); - vp9_cost_tokens(c->mb.intra_uv_mode_cost[1], - cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree); - vp9_cost_tokens(c->mb.intra_uv_mode_cost[0], - vp9_kf_uv_mode_prob[INTRA_MODES - 1], - vp9_intra_mode_tree); - - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) - vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i], - cm->fc.switchable_interp_prob[i], - vp9_switchable_interp_tree); -} diff --git a/media/libvpx/vp9/encoder/vp9_modecosts.h b/media/libvpx/vp9/encoder/vp9_modecosts.h deleted file mode 100644 index f43033e5fc81..000000000000 --- a/media/libvpx/vp9/encoder/vp9_modecosts.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_MODECOSTS_H_ -#define VP9_ENCODER_VP9_MODECOSTS_H_ - -void vp9_init_mode_costs(VP9_COMP *x); - -#endif // VP9_ENCODER_VP9_MODECOSTS_H_ diff --git a/media/libvpx/vp9/encoder/vp9_onyx_if.c b/media/libvpx/vp9/encoder/vp9_onyx_if.c deleted file mode 100644 index b7874d5152d3..000000000000 --- a/media/libvpx/vp9/encoder/vp9_onyx_if.c +++ /dev/null @@ -1,4309 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include -#include -#include - -#include "./vpx_config.h" -#include "./vpx_scale_rtcd.h" - -#include "vp9/common/vp9_alloccommon.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_idct.h" -#if CONFIG_VP9_POSTPROC -#include "vp9/common/vp9_postproc.h" -#endif -#include "vp9/common/vp9_reconinter.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vp9/common/vp9_tile_common.h" -#include "vp9/encoder/vp9_firstpass.h" -#include "vp9/encoder/vp9_mbgraph.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_picklpf.h" -#include "vp9/encoder/vp9_psnr.h" -#include "vp9/encoder/vp9_ratectrl.h" -#include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_segmentation.h" -#include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/vp9_vaq.h" - -#include "vpx_ports/vpx_timer.h" - - -extern void print_tree_update_probs(); - -static void set_default_lf_deltas(struct loopfilter *lf); - -#define DEFAULT_INTERP_FILTER SWITCHABLE - -#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */ - -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. - -// Masks for partially or completely disabling split mode -#define DISABLE_ALL_SPLIT 0x3F -#define DISABLE_ALL_INTER_SPLIT 0x1F -#define DISABLE_COMPOUND_SPLIT 0x18 -#define LAST_AND_INTRA_SPLIT_ONLY 0x1E - -#if CONFIG_INTERNAL_STATS -extern double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, int lumamask, - double *weight); - - -extern double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, double *ssim_y, - double *ssim_u, double *ssim_v); - - -#endif - -// #define OUTPUT_YUV_REC - -#ifdef OUTPUT_YUV_SRC -FILE *yuv_file; -#endif -#ifdef OUTPUT_YUV_REC -FILE *yuv_rec_file; -#endif - -#if 0 -FILE *framepsnr; -FILE *kf_list; -FILE *keyfile; -#endif - - -#ifdef ENTROPY_STATS -extern int intra_mode_stats[INTRA_MODES] - [INTRA_MODES] - [INTRA_MODES]; -#endif - -#ifdef MODE_STATS -extern void init_tx_count_stats(); -extern void write_tx_count_stats(); -extern void init_switchable_interp_stats(); -extern void write_switchable_interp_stats(); -#endif - -#ifdef SPEEDSTATS -unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0}; -#endif - -#if defined(SECTIONBITS_OUTPUT) -extern unsigned __int64 Sectionbits[500]; -#endif - -extern void vp9_init_quantizer(VP9_COMP *cpi); - -// Tables relating active max Q to active min Q -static int kf_low_motion_minq[QINDEX_RANGE]; -static int kf_high_motion_minq[QINDEX_RANGE]; -static int gf_low_motion_minq[QINDEX_RANGE]; -static int gf_high_motion_minq[QINDEX_RANGE]; -static int inter_minq[QINDEX_RANGE]; -static int afq_low_motion_minq[QINDEX_RANGE]; -static int afq_high_motion_minq[QINDEX_RANGE]; - -static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { - switch (mode) { - case NORMAL: - *hr = 1; - *hs = 1; - break; - case FOURFIVE: - *hr = 4; - *hs = 5; - break; - case THREEFIVE: - *hr = 3; - *hs = 5; - break; - case ONETWO: - *hr = 1; - *hs = 2; - break; - default: - *hr = 1; - *hs = 1; - assert(0); - break; - } -} - -// Functions to compute the active minq lookup table entries based on a -// formulaic approach to facilitate easier adjustment of the Q tables. 
-// The formulae were derived from computing a 3rd order polynomial best -// fit to the original data (after plotting real maxq vs minq (not q index)) -static int calculate_minq_index(double maxq, - double x3, double x2, double x1, double c) { - int i; - const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c, - maxq); - - // Special case handling to deal with the step from q2.0 - // down to lossless mode represented by q 1.0. - if (minqtarget <= 2.0) - return 0; - - for (i = 0; i < QINDEX_RANGE; i++) { - if (minqtarget <= vp9_convert_qindex_to_q(i)) - return i; - } - - return QINDEX_RANGE - 1; -} - -static void init_minq_luts(void) { - int i; - - for (i = 0; i < QINDEX_RANGE; i++) { - const double maxq = vp9_convert_qindex_to_q(i); - - - kf_low_motion_minq[i] = calculate_minq_index(maxq, - 0.000001, - -0.0004, - 0.15, - 0.0); - kf_high_motion_minq[i] = calculate_minq_index(maxq, - 0.000002, - -0.0012, - 0.5, - 0.0); - - gf_low_motion_minq[i] = calculate_minq_index(maxq, - 0.0000015, - -0.0009, - 0.32, - 0.0); - gf_high_motion_minq[i] = calculate_minq_index(maxq, - 0.0000021, - -0.00125, - 0.50, - 0.0); - inter_minq[i] = calculate_minq_index(maxq, - 0.00000271, - -0.00113, - 0.75, - 0.0); - afq_low_motion_minq[i] = calculate_minq_index(maxq, - 0.0000015, - -0.0009, - 0.33, - 0.0); - afq_high_motion_minq[i] = calculate_minq_index(maxq, - 0.0000021, - -0.00125, - 0.55, - 0.0); - } -} - -static int get_active_quality(int q, - int gfu_boost, - int low, - int high, - int *low_motion_minq, - int *high_motion_minq) { - int active_best_quality; - if (gfu_boost > high) { - active_best_quality = low_motion_minq[q]; - } else if (gfu_boost < low) { - active_best_quality = high_motion_minq[q]; - } else { - const int gap = high - low; - const int offset = high - gfu_boost; - const int qdiff = high_motion_minq[q] - low_motion_minq[q]; - const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; - active_best_quality = low_motion_minq[q] + adjustment; - } - return active_best_quality; -} - -static void set_mvcost(VP9_COMP *cpi) { - MACROBLOCK *const mb = &cpi->mb; - if (cpi->common.allow_high_precision_mv) { - mb->mvcost = mb->nmvcost_hp; - mb->mvsadcost = mb->nmvsadcost_hp; - } else { - mb->mvcost = mb->nmvcost; - mb->mvsadcost = mb->nmvsadcost; - } -} - -void vp9_initialize_enc() { - static int init_done = 0; - - if (!init_done) { - vp9_initialize_common(); - vp9_tokenize_initialize(); - vp9_init_quant_tables(); - vp9_init_me_luts(); - init_minq_luts(); - // init_base_skip_probs(); - init_done = 1; - } -} - -static void setup_features(VP9_COMMON *cm) { - struct loopfilter *const lf = &cm->lf; - struct segmentation *const seg = &cm->seg; - - // Set up default state for MB feature flags - seg->enabled = 0; - - seg->update_map = 0; - seg->update_data = 0; - vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); - - vp9_clearall_segfeatures(seg); - - lf->mode_ref_delta_enabled = 0; - lf->mode_ref_delta_update = 0; - vp9_zero(lf->ref_deltas); - vp9_zero(lf->mode_deltas); - vp9_zero(lf->last_ref_deltas); - vp9_zero(lf->last_mode_deltas); - - set_default_lf_deltas(lf); -} - -static void dealloc_compressor_data(VP9_COMP *cpi) { - // Delete sementation map - vpx_free(cpi->segmentation_map); - cpi->segmentation_map = 0; - vpx_free(cpi->common.last_frame_seg_map); - cpi->common.last_frame_seg_map = 0; - vpx_free(cpi->coding_context.last_frame_seg_map_copy); - cpi->coding_context.last_frame_seg_map_copy = 0; - - vpx_free(cpi->active_map); - cpi->active_map = 0; - - 
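To put numbers on calculate_minq_index() above: the cubic is evaluated as ((x3 * q + x2) * q + x1) * q + c and capped at maxq. With the kf_low_motion coefficients (0.000001, -0.0004, 0.15, 0.0) and maxq = 100, that is

  /* minqtarget for kf_low_motion at maxq == 100 */
  const double maxq = 100.0;
  const double minqtarget =
      ((0.000001 * maxq - 0.0004) * maxq + 0.15) * maxq;  /* == 12.0 */

so the key-frame low-motion entry becomes the smallest q index whose real Q reaches 12, roughly an 8:1 max-to-min spread at that operating point.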
vp9_free_frame_buffers(&cpi->common); - - vp9_free_frame_buffer(&cpi->last_frame_uf); - vp9_free_frame_buffer(&cpi->scaled_source); - vp9_free_frame_buffer(&cpi->alt_ref_buffer); - vp9_lookahead_destroy(cpi->lookahead); - - vpx_free(cpi->tok); - cpi->tok = 0; - - // Activity mask based per mb zbin adjustments - vpx_free(cpi->mb_activity_map); - cpi->mb_activity_map = 0; - vpx_free(cpi->mb_norm_activity_map); - cpi->mb_norm_activity_map = 0; - - vpx_free(cpi->above_context[0]); - cpi->above_context[0] = NULL; - - vpx_free(cpi->above_seg_context); - cpi->above_seg_context = NULL; -} - -// Computes a q delta (in "q index" terms) to get from a starting q value -// to a target value -// target q value -int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { - int i; - int start_index = cpi->worst_quality; - int target_index = cpi->worst_quality; - - // Convert the average q value to an index. - for (i = cpi->best_quality; i < cpi->worst_quality; i++) { - start_index = i; - if (vp9_convert_qindex_to_q(i) >= qstart) - break; - } - - // Convert the q target to an index - for (i = cpi->best_quality; i < cpi->worst_quality; i++) { - target_index = i; - if (vp9_convert_qindex_to_q(i) >= qtarget) - break; - } - - return target_index - start_index; -} - -static void configure_static_seg_features(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - struct segmentation *seg = &cm->seg; - - int high_q = (int)(cpi->avg_q > 48.0); - int qi_delta; - - // Disable and clear down for KF - if (cm->frame_type == KEY_FRAME) { - // Clear down the global segmentation map - vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - seg->update_map = 0; - seg->update_data = 0; - cpi->static_mb_pct = 0; - - // Disable segmentation - vp9_disable_segmentation((VP9_PTR)cpi); - - // Clear down the segment features. - vp9_clearall_segfeatures(seg); - } else if (cpi->refresh_alt_ref_frame) { - // If this is an alt ref frame - // Clear down the global segmentation map - vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - seg->update_map = 0; - seg->update_data = 0; - cpi->static_mb_pct = 0; - - // Disable segmentation and individual segment features by default - vp9_disable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(seg); - - // Scan frames from current to arf frame. - // This function re-enables segmentation if appropriate. - vp9_update_mbgraph_stats(cpi); - - // If segmentation was enabled set those features needed for the - // arf itself. 
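Note how vp9_compute_qdelta() above operates purely in q-index space: both qstart and qtarget are mapped to the nearest index via vp9_convert_qindex_to_q() and the index difference is returned, which lets callers express a multiplicative change in real Q as an additive delta. The arf branch just below uses exactly that, requesting a 12.5% drop in Q for segment 1 (a sketch distilled from the calls that follow):

  int qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, cpi->avg_q * 0.875);
  vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
  vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);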
- if (seg->enabled) { - seg->update_map = 1; - seg->update_data = 1; - - qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); - vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); - vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - - vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); - vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); - - // Where relevant assume segment data is delta data - seg->abs_delta = SEGMENT_DELTADATA; - } - } else if (seg->enabled) { - // All other frames if segmentation has been enabled - - // First normal frame in a valid gf or alt ref group - if (cpi->frames_since_golden == 0) { - // Set up segment features for normal frames in an arf group - if (cpi->source_alt_ref_active) { - seg->update_map = 0; - seg->update_data = 1; - seg->abs_delta = SEGMENT_DELTADATA; - - qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, - (cpi->avg_q * 1.125)); - vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); - vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); - - vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); - - // Segment coding disabled for compred testing - if (high_q || (cpi->static_mb_pct == 100)) { - vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); - } - } else { - // Disable segmentation and clear down features if alt ref - // is not active for this group - - vp9_disable_segmentation((VP9_PTR)cpi); - - vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - - seg->update_map = 0; - seg->update_data = 0; - - vp9_clearall_segfeatures(seg); - } - } else if (cpi->is_src_frame_alt_ref) { - // Special case where we are coding over the top of a previous - // alt ref frame. - // Segment coding disabled for compred testing - - // Enable ref frame features for segment 0 as well - vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); - - // All mbs should use ALTREF_FRAME - vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); - vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); - vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); - - // Skip all MBs if high Q (0,0 mv and skip coeffs) - if (high_q) { - vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP); - vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); - } - // Enable data update - seg->update_data = 1; - } else { - // All other frames. - - // No updates.. leave things as they are. 
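The branches above illustrate the two-call protocol for segment features: vp9_enable_segfeature() turns a feature on for a segment, vp9_set_segdata() supplies its parameter, and seg->abs_delta says whether that data is absolute or a delta. Forcing a segment to code entirely against the alt-ref frame, as the alt-ref overlay case does, reduces to (a sketch assembled from the calls shown above):

  vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
  vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
  vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
  if (high_q)
    vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);  /* 0,0 mv, no coeffs */

The two assignments that follow are the do-nothing case: neither the segment map nor the segment data is refreshed.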
- seg->update_map = 0; - seg->update_data = 0; - } - } -} - -#ifdef ENTROPY_STATS -void vp9_update_mode_context_stats(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int i, j; - unsigned int (*inter_mode_counts)[INTER_MODES - 1][2] = - cm->fc.inter_mode_counts; - int64_t (*mv_ref_stats)[INTER_MODES - 1][2] = cpi->mv_ref_stats; - FILE *f; - - // Read the past stats counters - f = fopen("mode_context.bin", "rb"); - if (!f) { - vpx_memset(cpi->mv_ref_stats, 0, sizeof(cpi->mv_ref_stats)); - } else { - fread(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f); - fclose(f); - } - - // Add in the values for this frame - for (i = 0; i < INTER_MODE_CONTEXTS; i++) { - for (j = 0; j < INTER_MODES - 1; j++) { - mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0]; - mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1]; - } - } - - // Write back the accumulated stats - f = fopen("mode_context.bin", "wb"); - fwrite(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f); - fclose(f); -} - -void print_mode_context(VP9_COMP *cpi) { - FILE *f = fopen("vp9_modecont.c", "a"); - int i, j; - - fprintf(f, "#include \"vp9_entropy.h\"\n"); - fprintf( - f, - "const int inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] ="); - fprintf(f, "{\n"); - for (j = 0; j < INTER_MODE_CONTEXTS; j++) { - fprintf(f, " {/* %d */ ", j); - fprintf(f, " "); - for (i = 0; i < INTER_MODES - 1; i++) { - int this_prob; - int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1]; - if (count) - this_prob = ((cpi->mv_ref_stats[j][i][0] * 256) + (count >> 1)) / count; - else - this_prob = 128; - - // context probs - fprintf(f, "%5d, ", this_prob); - } - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} -#endif // ENTROPY_STATS - -// DEBUG: Print out the segment id of each MB in the current frame. -static void print_seg_map(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int row, col; - int map_index = 0; - FILE *statsfile = fopen("segmap.stt", "a"); - - fprintf(statsfile, "%10d\n", cm->current_video_frame); - - for (row = 0; row < cpi->common.mi_rows; row++) { - for (col = 0; col < cpi->common.mi_cols; col++) { - fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]); - map_index++; - } - fprintf(statsfile, "\n"); - } - fprintf(statsfile, "\n"); - - fclose(statsfile); -} - -static void update_reference_segmentation_map(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - int row, col; - MODE_INFO **mi_8x8, **mi_8x8_ptr = cm->mi_grid_visible; - uint8_t *cache_ptr = cm->last_frame_seg_map, *cache; - - for (row = 0; row < cm->mi_rows; row++) { - mi_8x8 = mi_8x8_ptr; - cache = cache_ptr; - for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++) - cache[0] = mi_8x8[0]->mbmi.segment_id; - mi_8x8_ptr += cm->mode_info_stride; - cache_ptr += cm->mi_cols; - } -} - -static void set_default_lf_deltas(struct loopfilter *lf) { - lf->mode_ref_delta_enabled = 1; - lf->mode_ref_delta_update = 1; - - vp9_zero(lf->ref_deltas); - vp9_zero(lf->mode_deltas); - - // Test of ref frame deltas - lf->ref_deltas[INTRA_FRAME] = 2; - lf->ref_deltas[LAST_FRAME] = 0; - lf->ref_deltas[GOLDEN_FRAME] = -2; - lf->ref_deltas[ALTREF_FRAME] = -2; - - lf->mode_deltas[0] = 0; // Zero - lf->mode_deltas[1] = 0; // New mv -} - -static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode) { - SPEED_FEATURES *sf = &cpi->sf; - int i; - - // Set baseline threshold values - for (i = 0; i < MAX_MODES; ++i) - sf->thresh_mult[i] = mode == 0 ? 
-500 : 0; - - sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG] = 0; - sf->thresh_mult[THR_NEARESTA] = 0; - - sf->thresh_mult[THR_DC] += 1000; - - sf->thresh_mult[THR_NEWMV] += 1000; - sf->thresh_mult[THR_NEWA] += 1000; - sf->thresh_mult[THR_NEWG] += 1000; - - sf->thresh_mult[THR_NEARMV] += 1000; - sf->thresh_mult[THR_NEARA] += 1000; - sf->thresh_mult[THR_COMP_NEARESTLA] += 1000; - sf->thresh_mult[THR_COMP_NEARESTGA] += 1000; - - sf->thresh_mult[THR_TM] += 1000; - - sf->thresh_mult[THR_COMP_NEARLA] += 1500; - sf->thresh_mult[THR_COMP_NEWLA] += 2000; - sf->thresh_mult[THR_NEARG] += 1000; - sf->thresh_mult[THR_COMP_NEARGA] += 1500; - sf->thresh_mult[THR_COMP_NEWGA] += 2000; - - sf->thresh_mult[THR_ZEROMV] += 2000; - sf->thresh_mult[THR_ZEROG] += 2000; - sf->thresh_mult[THR_ZEROA] += 2000; - sf->thresh_mult[THR_COMP_ZEROLA] += 2500; - sf->thresh_mult[THR_COMP_ZEROGA] += 2500; - - sf->thresh_mult[THR_H_PRED] += 2000; - sf->thresh_mult[THR_V_PRED] += 2000; - sf->thresh_mult[THR_D45_PRED ] += 2500; - sf->thresh_mult[THR_D135_PRED] += 2500; - sf->thresh_mult[THR_D117_PRED] += 2500; - sf->thresh_mult[THR_D153_PRED] += 2500; - sf->thresh_mult[THR_D207_PRED] += 2500; - sf->thresh_mult[THR_D63_PRED] += 2500; - - /* disable frame modes if flags not set */ - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { - sf->thresh_mult[THR_NEWMV ] = INT_MAX; - sf->thresh_mult[THR_NEARESTMV] = INT_MAX; - sf->thresh_mult[THR_ZEROMV ] = INT_MAX; - sf->thresh_mult[THR_NEARMV ] = INT_MAX; - } - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { - sf->thresh_mult[THR_NEARESTG ] = INT_MAX; - sf->thresh_mult[THR_ZEROG ] = INT_MAX; - sf->thresh_mult[THR_NEARG ] = INT_MAX; - sf->thresh_mult[THR_NEWG ] = INT_MAX; - } - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { - sf->thresh_mult[THR_NEARESTA ] = INT_MAX; - sf->thresh_mult[THR_ZEROA ] = INT_MAX; - sf->thresh_mult[THR_NEARA ] = INT_MAX; - sf->thresh_mult[THR_NEWA ] = INT_MAX; - } - - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != - (VP9_LAST_FLAG | VP9_ALT_FLAG)) { - sf->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; - } - if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != - (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { - sf->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; - sf->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; - sf->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; - } -} - -static void set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi, int mode) { - SPEED_FEATURES *sf = &cpi->sf; - int i; - - for (i = 0; i < MAX_REFS; ++i) - sf->thresh_mult_sub8x8[i] = mode == 0 ? -500 : 0; - - sf->thresh_mult_sub8x8[THR_LAST] += 2500; - sf->thresh_mult_sub8x8[THR_GOLD] += 2500; - sf->thresh_mult_sub8x8[THR_ALTR] += 2500; - sf->thresh_mult_sub8x8[THR_INTRA] += 2500; - sf->thresh_mult_sub8x8[THR_COMP_LA] += 4500; - sf->thresh_mult_sub8x8[THR_COMP_GA] += 4500; - - // Check for masked out split cases. 
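A convention worth spelling out for these tables: the thresh_mult entries act as per-mode rate-distortion hurdles, so a larger multiplier makes a mode harder to justify during mode selection, and INT_MAX removes it from the search entirely, which is the idiom used by all of the ref_frame_flags checks above. A hypothetical helper stating that invariant:

  /* hypothetical: a mode is still searchable only while its
     multiplier is finite */
  static INLINE int mode_is_enabled(const SPEED_FEATURES *sf, int m) {
    return sf->thresh_mult[m] != INT_MAX;
  }

The loop below applies disable_split_mask in the same spirit, reading bit i as "disable sub-8x8 table entry i".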
- for (i = 0; i < MAX_REFS; i++) { - if (sf->disable_split_mask & (1 << i)) - sf->thresh_mult_sub8x8[i] = INT_MAX; - } - - // disable mode test if frame flag is not set - if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) - sf->thresh_mult_sub8x8[THR_LAST] = INT_MAX; - if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) - sf->thresh_mult_sub8x8[THR_GOLD] = INT_MAX; - if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) - sf->thresh_mult_sub8x8[THR_ALTR] = INT_MAX; - if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != - (VP9_LAST_FLAG | VP9_ALT_FLAG)) - sf->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX; - if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != - (VP9_GOLD_FLAG | VP9_ALT_FLAG)) - sf->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX; -} - -void vp9_set_speed_features(VP9_COMP *cpi) { - SPEED_FEATURES *sf = &cpi->sf; - int mode = cpi->compressor_speed; - int speed = cpi->speed; - int i; - - // Only modes 0 and 1 supported for now in experimental code basae - if (mode > 1) - mode = 1; - - for (i = 0; i < MAX_MODES; ++i) - cpi->mode_chosen_counts[i] = 0; - - // best quality defaults - sf->RD = 1; - sf->search_method = NSTEP; - sf->auto_filter = 1; - sf->recode_loop = 1; - sf->subpel_search_method = SUBPEL_TREE; - sf->subpel_iters_per_step = 2; - sf->optimize_coefficients = !cpi->oxcf.lossless; - sf->reduce_first_step_size = 0; - sf->auto_mv_step_size = 0; - sf->max_step_search_steps = MAX_MVSEARCH_STEPS; - sf->comp_inter_joint_search_thresh = BLOCK_4X4; - sf->adaptive_rd_thresh = 0; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF; - sf->tx_size_search_method = USE_FULL_RD; - sf->use_lp32x32fdct = 0; - sf->adaptive_motion_search = 0; - sf->use_avoid_tested_higherror = 0; - sf->reference_masking = 0; - sf->use_one_partition_size_always = 0; - sf->less_rectangular_check = 0; - sf->use_square_partition_only = 0; - sf->auto_min_max_partition_size = 0; - sf->max_partition_size = BLOCK_64X64; - sf->min_partition_size = BLOCK_4X4; - sf->adjust_partitioning_from_last_frame = 0; - sf->last_partitioning_redo_frequency = 4; - sf->disable_split_mask = 0; - sf->mode_search_skip_flags = 0; - sf->disable_split_var_thresh = 0; - sf->disable_filter_search_var_thresh = 0; - for (i = 0; i < TX_SIZES; i++) { - sf->intra_y_mode_mask[i] = ALL_INTRA_MODES; - sf->intra_uv_mode_mask[i] = ALL_INTRA_MODES; - } - sf->use_rd_breakout = 0; - sf->skip_encode_sb = 0; - sf->use_uv_intra_rd_estimate = 0; - sf->use_fast_lpf_pick = 0; - sf->use_fast_coef_updates = 0; - sf->using_small_partition_info = 0; - sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set - -#if CONFIG_MULTIPLE_ARF - // Switch segmentation off. - sf->static_segmentation = 0; -#else - sf->static_segmentation = 0; -#endif - - switch (mode) { - case 0: // This is the best quality mode. - break; - - case 1: -#if CONFIG_MULTIPLE_ARF - // Switch segmentation off. - sf->static_segmentation = 0; -#else - sf->static_segmentation = 0; -#endif - sf->use_avoid_tested_higherror = 1; - sf->adaptive_rd_thresh = 1; - sf->recode_loop = (speed < 1); - - if (speed == 1) { - sf->use_square_partition_only = !frame_is_intra_only(&cpi->common); - sf->less_rectangular_check = 1; - sf->tx_size_search_method = frame_is_intra_only(&cpi->common) - ? USE_FULL_RD : USE_LARGESTALL; - - if (MIN(cpi->common.width, cpi->common.height) >= 720) - sf->disable_split_mask = cpi->common.show_frame ? 
- DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - else - sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; - - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; - sf->auto_mv_step_size = 1; - sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; - } - if (speed == 2) { - sf->use_square_partition_only = !frame_is_intra_only(&cpi->common); - sf->less_rectangular_check = 1; - sf->tx_size_search_method = frame_is_intra_only(&cpi->common) - ? USE_FULL_RD : USE_LARGESTALL; - - if (MIN(cpi->common.width, cpi->common.height) >= 720) - sf->disable_split_mask = cpi->common.show_frame ? - DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - else - sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; - - - sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; - - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; - sf->auto_mv_step_size = 1; - - sf->disable_filter_search_var_thresh = 16; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - - sf->auto_min_max_partition_size = 1; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; - sf->adjust_partitioning_from_last_frame = 1; - sf->last_partitioning_redo_frequency = 3; - - sf->adaptive_rd_thresh = 2; - sf->recode_loop = 2; - sf->use_lp32x32fdct = 1; - sf->mode_skip_start = 11; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; - } - if (speed == 3) { - sf->use_square_partition_only = 1; - sf->tx_size_search_method = USE_LARGESTALL; - - if (MIN(cpi->common.width, cpi->common.height) >= 720) - sf->disable_split_mask = DISABLE_ALL_SPLIT; - else - sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; - - sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_INTRA_LOWVAR; - - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; - sf->auto_mv_step_size = 1; - - sf->disable_filter_search_var_thresh = 16; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - - sf->auto_min_max_partition_size = 1; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; - sf->adjust_partitioning_from_last_frame = 1; - sf->last_partitioning_redo_frequency = 3; - - sf->use_uv_intra_rd_estimate = 1; - sf->skip_encode_sb = 1; - sf->use_lp32x32fdct = 1; - sf->subpel_iters_per_step = 1; - sf->use_fast_coef_updates = 2; - - sf->adaptive_rd_thresh = 4; - sf->mode_skip_start = 6; - } - if (speed == 4) { - sf->use_square_partition_only = 1; - sf->tx_size_search_method = USE_LARGESTALL; - sf->disable_split_mask = DISABLE_ALL_SPLIT; - - sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_COMP_REFMISMATCH | - FLAG_SKIP_INTRA_LOWVAR | - FLAG_EARLY_TERMINATE; - - sf->use_rd_breakout = 1; - sf->adaptive_motion_search = 1; - sf->auto_mv_step_size = 1; - - sf->disable_filter_search_var_thresh = 16; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - - sf->auto_min_max_partition_size = 1; - sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; - sf->adjust_partitioning_from_last_frame = 1; - sf->last_partitioning_redo_frequency = 3; - - sf->use_uv_intra_rd_estimate = 1; - sf->skip_encode_sb = 1; - sf->use_lp32x32fdct = 1; - 
sf->subpel_iters_per_step = 1; - sf->use_fast_coef_updates = 2; - - sf->adaptive_rd_thresh = 4; - sf->mode_skip_start = 6; - - /* sf->intra_y_mode_mask = INTRA_DC_ONLY; - sf->intra_uv_mode_mask = INTRA_DC_ONLY; - sf->search_method = BIGDIA; - sf->disable_split_var_thresh = 64; - sf->disable_filter_search_var_thresh = 64; */ - } - if (speed == 5) { - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; - sf->use_one_partition_size_always = 1; - sf->always_this_block_size = BLOCK_16X16; - sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ? - USE_FULL_RD : USE_LARGESTALL; - sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | - FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_COMP_REFMISMATCH | - FLAG_SKIP_INTRA_LOWVAR | - FLAG_EARLY_TERMINATE; - sf->use_rd_breakout = 1; - sf->use_lp32x32fdct = 1; - sf->optimize_coefficients = 0; - sf->auto_mv_step_size = 1; - // sf->reduce_first_step_size = 1; - // sf->reference_masking = 1; - - sf->disable_split_mask = DISABLE_ALL_SPLIT; - sf->search_method = HEX; - sf->subpel_iters_per_step = 1; - sf->disable_split_var_thresh = 64; - sf->disable_filter_search_var_thresh = 96; - for (i = 0; i < TX_SIZES; i++) { - sf->intra_y_mode_mask[i] = INTRA_DC_ONLY; - sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY; - } - sf->use_fast_coef_updates = 2; - sf->adaptive_rd_thresh = 4; - sf->mode_skip_start = 6; - } - break; - }; /* switch */ - - // Set rd thresholds based on mode and speed setting - set_rd_speed_thresholds(cpi, mode); - set_rd_speed_thresholds_sub8x8(cpi, mode); - - // Slow quant, dct and trellis not worthwhile for first pass - // so make sure they are always turned off. - if (cpi->pass == 1) { - sf->optimize_coefficients = 0; - } - - // No recode for 1 pass. - if (cpi->pass == 0) { - sf->recode_loop = 0; - sf->optimize_coefficients = 0; - } - - cpi->mb.fwd_txm4x4 = vp9_fdct4x4; - if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm4x4 = vp9_fwht4x4; - } - - if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) { - cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative; - cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative; - } else if (cpi->sf.subpel_search_method == SUBPEL_TREE) { - cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; - cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree; - } - - cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1; - -#ifdef SPEEDSTATS - frames_at_speed[cpi->speed]++; -#endif -} - -static void alloc_raw_frame_buffers(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - - cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height, - cm->subsampling_x, cm->subsampling_y, - cpi->oxcf.lag_in_frames); - if (!cpi->lookahead) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate lag buffers"); - - if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, - cpi->oxcf.width, cpi->oxcf.height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate altref buffer"); -} - -void vp9_alloc_compressor_data(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - - if (vp9_alloc_frame_buffers(cm, cm->width, cm->height)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffers"); - - if (vp9_alloc_frame_buffer(&cpi->last_frame_uf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) - vpx_internal_error(&cpi->common.error, 
VPX_CODEC_MEM_ERROR, - "Failed to allocate last frame buffer"); - - if (vp9_alloc_frame_buffer(&cpi->scaled_source, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate scaled source buffer"); - - vpx_free(cpi->tok); - - { - unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - - CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); - } - - vpx_free(cpi->mb_activity_map); - CHECK_MEM_ERROR(cm, cpi->mb_activity_map, - vpx_calloc(sizeof(unsigned int), - cm->mb_rows * cm->mb_cols)); - - vpx_free(cpi->mb_norm_activity_map); - CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map, - vpx_calloc(sizeof(unsigned int), - cm->mb_rows * cm->mb_cols)); - - // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm - // block where mi unit size is 8x8. - vpx_free(cpi->above_context[0]); - CHECK_MEM_ERROR(cm, cpi->above_context[0], - vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) * - MAX_MB_PLANE, - sizeof(*cpi->above_context[0]))); - - vpx_free(cpi->above_seg_context); - CHECK_MEM_ERROR(cm, cpi->above_seg_context, - vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols), - sizeof(*cpi->above_seg_context))); -} - - -static void update_frame_size(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - - vp9_update_frame_size(cm); - - // Update size of buffers local to this frame - if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate last frame buffer"); - - if (vp9_realloc_frame_buffer(&cpi->scaled_source, - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate scaled source buffer"); - - { - int y_stride = cpi->scaled_source.y_stride; - - if (cpi->sf.search_method == NSTEP) { - vp9_init3smotion_compensation(&cpi->mb, y_stride); - } else if (cpi->sf.search_method == DIAMOND) { - vp9_init_dsmotion_compensation(&cpi->mb, y_stride); - } - } - - { - int i; - for (i = 1; i < MAX_MB_PLANE; ++i) { - cpi->above_context[i] = cpi->above_context[0] + - i * sizeof(*cpi->above_context[0]) * 2 * - mi_cols_aligned_to_sb(cm->mi_cols); - } - } -} - - -// Table that converts 0-63 Q range values passed in outside to the Qindex -// range used internally. 
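The table below is close to linear: entries step by 4 from 0 through 244, with the last two stretched to 249 and 255 so that external Q 63 lands on the internal maximum. Because the entries are strictly increasing, vp9_reverse_trans(), defined right after the table, inverts the mapping exactly by linear scan:

  /* round-trip property, valid for 0 <= q <= 63 */
  assert(vp9_reverse_trans(q_trans[q]) == q);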
-static const int q_trans[] = { - 0, 4, 8, 12, 16, 20, 24, 28, - 32, 36, 40, 44, 48, 52, 56, 60, - 64, 68, 72, 76, 80, 84, 88, 92, - 96, 100, 104, 108, 112, 116, 120, 124, - 128, 132, 136, 140, 144, 148, 152, 156, - 160, 164, 168, 172, 176, 180, 184, 188, - 192, 196, 200, 204, 208, 212, 216, 220, - 224, 228, 232, 236, 240, 244, 249, 255, -}; - -int vp9_reverse_trans(int x) { - int i; - - for (i = 0; i < 64; i++) - if (q_trans[i] >= x) - return i; - - return 63; -}; -void vp9_new_framerate(VP9_COMP *cpi, double framerate) { - if (framerate < 0.1) - framerate = 30; - - cpi->oxcf.framerate = framerate; - cpi->output_framerate = cpi->oxcf.framerate; - cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth - / cpi->output_framerate); - cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth - / cpi->output_framerate); - cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * - cpi->oxcf.two_pass_vbrmin_section / 100); - - - cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS); - - // Set Maximum gf/arf interval - cpi->max_gf_interval = 16; - - // Extended interval for genuinely static scenes - cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1; - - // Special conditions when alt ref frame enabled in lagged compress mode - if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) { - if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1) - cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1; - - if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1) - cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1; - } - - if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval) - cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval; -} - -static int64_t rescale(int val, int64_t num, int denom) { - int64_t llnum = num; - int64_t llden = denom; - int64_t llval = val; - - return (llval * llnum / llden); -} - -static void set_tile_limits(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - - int min_log2_tile_cols, max_log2_tile_cols; - vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - - cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns, - min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; -} - -static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *const cm = &cpi->common; - int i; - - cpi->oxcf = *oxcf; - - cm->version = oxcf->version; - - cm->width = oxcf->width; - cm->height = oxcf->height; - cm->subsampling_x = 0; - cm->subsampling_y = 0; - vp9_alloc_compressor_data(cpi); - - // change includes all joint functionality - vp9_change_config(ptr, oxcf); - - // Initialize active best and worst q and average q values. 
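One unit detail in vp9_new_framerate() above: target_bandwidth is in bits per second by this point (vp9_change_config(), below, multiplies the Kbit/s input by 1000), so the per-frame budget is a plain division, afterwards floored at two_pass_vbrmin_section percent of the average and at FRAME_OVERHEAD_BITS. Illustrative numbers only:

  /* e.g. 256 kbit/s at 30 fps */
  const int target_bandwidth = 256000;                  /* bit/s */
  const int per_frame = (int)(target_bandwidth / 30.0); /* 8533 bits */

init_config() then seeds the active q range and buffer levels, as the assignments below show.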
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q; - cpi->active_best_quality = cpi->oxcf.best_allowed_q; - cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; - - // Initialise the starting buffer levels - cpi->buffer_level = cpi->oxcf.starting_buffer_level; - cpi->bits_off_target = cpi->oxcf.starting_buffer_level; - - cpi->rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth; - cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth; - - cpi->total_actual_bits = 0; - cpi->total_target_vs_actual = 0; - - cpi->static_mb_pct = 0; - - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; - - cpi->current_layer = 0; - cpi->use_svc = 0; - - set_tile_limits(cpi); - - cpi->fixed_divide[0] = 0; - for (i = 1; i < 512; i++) - cpi->fixed_divide[i] = 0x80000 / i; -} - - -void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *const cm = &cpi->common; - - if (!cpi || !oxcf) - return; - - if (cm->version != oxcf->version) { - cm->version = oxcf->version; - } - - cpi->oxcf = *oxcf; - - switch (cpi->oxcf.Mode) { - // Real time and one pass deprecated in test code base - case MODE_GOODQUALITY: - cpi->pass = 0; - cpi->compressor_speed = 2; - cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5); - break; - - case MODE_FIRSTPASS: - cpi->pass = 1; - cpi->compressor_speed = 1; - break; - - case MODE_SECONDPASS: - cpi->pass = 2; - cpi->compressor_speed = 1; - cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5); - break; - - case MODE_SECONDPASS_BEST: - cpi->pass = 2; - cpi->compressor_speed = 0; - break; - } - - cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q]; - cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q]; - cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level]; - - cpi->oxcf.lossless = oxcf->lossless; - cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add - : vp9_idct4x4_add; - cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; - - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - // cpi->use_golden_frame_only = 0; - // cpi->use_last_frame_only = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_last_frame = 1; - cm->refresh_frame_context = 1; - cm->reset_frame_context = 0; - - setup_features(cm); - cpi->common.allow_high_precision_mv = 0; // Default mv precision - set_mvcost(cpi); - - { - int i; - - for (i = 0; i < MAX_SEGMENTS; i++) - cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; - } - - // At the moment the first order values may not be > MAXQ - cpi->oxcf.fixed_q = MIN(cpi->oxcf.fixed_q, MAXQ); - - // local file playback mode == really big buffer - if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK) { - cpi->oxcf.starting_buffer_level = 60000; - cpi->oxcf.optimal_buffer_level = 60000; - cpi->oxcf.maximum_buffer_size = 240000; - } - - // Convert target bandwidth from Kbit/s to Bit/s - cpi->oxcf.target_bandwidth *= 1000; - - cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - - // Set or reset optimal and maximum buffer levels. 
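The rescale() calls in this block are unit conversions. Under what appears to be the usual libvpx convention, the incoming oxcf buffer levels are in milliseconds, so rescale(level_ms, target_bandwidth, 1000) = level_ms * bits_per_second / 1000 turns milliseconds of buffering into bits; the "really big buffer" values above (60000) then read naturally as 60 seconds. A worked number, assuming 256 kbit/s:

  /* 60000 ms of buffering at 256000 bit/s */
  const int64_t bits = rescale(60000, 256000, 1000);  /* 15360000 bits */

When a level was left at zero, the defaults below fall back to target_bandwidth / 8 instead.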
- if (cpi->oxcf.optimal_buffer_level == 0) - cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; - else - cpi->oxcf.optimal_buffer_level = rescale(cpi->oxcf.optimal_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - - if (cpi->oxcf.maximum_buffer_size == 0) - cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8; - else - cpi->oxcf.maximum_buffer_size = rescale(cpi->oxcf.maximum_buffer_size, - cpi->oxcf.target_bandwidth, 1000); - - // Set up frame rate and related parameters rate control values. - vp9_new_framerate(cpi, cpi->oxcf.framerate); - - // Set absolute upper and lower quality limits - cpi->worst_quality = cpi->oxcf.worst_allowed_q; - cpi->best_quality = cpi->oxcf.best_allowed_q; - - // active values should only be modified if out of new range - cpi->active_worst_quality = clamp(cpi->active_worst_quality, - cpi->oxcf.best_allowed_q, - cpi->oxcf.worst_allowed_q); - - cpi->active_best_quality = clamp(cpi->active_best_quality, - cpi->oxcf.best_allowed_q, - cpi->oxcf.worst_allowed_q); - - cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0; - - cpi->cq_target_quality = cpi->oxcf.cq_level; - - cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; - - cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - - cm->display_width = cpi->oxcf.width; - cm->display_height = cpi->oxcf.height; - - // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) - cpi->oxcf.Sharpness = MIN(7, cpi->oxcf.Sharpness); - - cpi->common.lf.sharpness_level = cpi->oxcf.Sharpness; - - if (cpi->initial_width) { - // Increasing the size of the frame beyond the first seen frame, or some - // otherwise signalled maximum size, is not supported. - // TODO(jkoleszar): exit gracefully. - assert(cm->width <= cpi->initial_width); - assert(cm->height <= cpi->initial_height); - } - update_frame_size(cpi); - - if (cpi->oxcf.fixed_q >= 0) { - cpi->last_q[0] = cpi->oxcf.fixed_q; - cpi->last_q[1] = cpi->oxcf.fixed_q; - cpi->last_boosted_qindex = cpi->oxcf.fixed_q; - } - - cpi->speed = cpi->oxcf.cpu_used; - - if (cpi->oxcf.lag_in_frames == 0) { - // force to allowlag to 0 if lag_in_frames is 0; - cpi->oxcf.allow_lag = 0; - } else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS) { - // Limit on lag buffers as these are not currently dynamically allocated - cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; - } - - // YX Temp -#if CONFIG_MULTIPLE_ARF - vp9_zero(cpi->alt_ref_source); -#else - cpi->alt_ref_source = NULL; -#endif - cpi->is_src_frame_alt_ref = 0; - -#if 0 - // Experimental RD Code - cpi->frame_distortion = 0; - cpi->last_frame_distortion = 0; -#endif - - set_tile_limits(cpi); -} - -#define M_LOG2_E 0.693147180559945309417 -#define log2f(x) (log (x) / (float) M_LOG2_E) - -static void cal_nmvjointsadcost(int *mvjointsadcost) { - mvjointsadcost[0] = 600; - mvjointsadcost[1] = 300; - mvjointsadcost[2] = 300; - mvjointsadcost[0] = 300; -} - -static void cal_nmvsadcosts(int *mvsadcost[2]) { - int i = 1; - - mvsadcost[0][0] = 0; - mvsadcost[1][0] = 0; - - do { - double z = 256 * (2 * (log2f(8 * i) + .6)); - mvsadcost[0][i] = (int)z; - mvsadcost[1][i] = (int)z; - mvsadcost[0][-i] = (int)z; - mvsadcost[1][-i] = (int)z; - } while (++i <= MV_MAX); -} - -static void cal_nmvsadcosts_hp(int *mvsadcost[2]) { - int i = 1; - - mvsadcost[0][0] = 0; - mvsadcost[1][0] = 0; - - do { - double z = 256 * (2 * (log2f(8 * i) + .6)); - mvsadcost[0][i] = (int)z; - mvsadcost[1][i] = (int)z; - mvsadcost[0][-i] = (int)z; - mvsadcost[1][-i] = (int)z; - } while (++i <= MV_MAX); -} - -static void alloc_mode_context(VP9_COMMON *cm, int 
num_4x4_blk, - PICK_MODE_CONTEXT *ctx) { - int num_pix = num_4x4_blk << 4; - int i, k; - ctx->num_4x4_blk = num_4x4_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, - vpx_calloc(num_4x4_blk, sizeof(uint8_t))); - for (i = 0; i < MAX_MB_PLANE; ++i) { - for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], - vpx_memalign(16, num_pix * sizeof(int16_t))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], - vpx_memalign(16, num_pix * sizeof(uint16_t))); - ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; - ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; - ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k]; - ctx->eobs_pbuf[i][k] = ctx->eobs[i][k]; - } - } -} - -static void free_mode_context(PICK_MODE_CONTEXT *ctx) { - int i, k; - vpx_free(ctx->zcoeff_blk); - ctx->zcoeff_blk = 0; - for (i = 0; i < MAX_MB_PLANE; ++i) { - for (k = 0; k < 3; ++k) { - vpx_free(ctx->coeff[i][k]); - ctx->coeff[i][k] = 0; - vpx_free(ctx->qcoeff[i][k]); - ctx->qcoeff[i][k] = 0; - vpx_free(ctx->dqcoeff[i][k]); - ctx->dqcoeff[i][k] = 0; - vpx_free(ctx->eobs[i][k]); - ctx->eobs[i][k] = 0; - } - } -} - -static void init_pick_mode_context(VP9_COMP *cpi) { - int i; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - - - for (i = 0; i < BLOCK_SIZES; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - if (i < BLOCK_16X16) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { - for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } - } - } else if (i < BLOCK_32X32) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } - } else if (i < BLOCK_64X64) { - for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } else { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - ctx->num_4x4_blk = num_4x4_blk; - alloc_mode_context(cm, num_4x4_blk, ctx); - } - } -} - -static void free_pick_mode_context(MACROBLOCK *x) { - int i; - - for (i = 0; i < BLOCK_SIZES; ++i) { - const int num_4x4_w = num_4x4_blocks_wide_lookup[i]; - const int num_4x4_h = num_4x4_blocks_high_lookup[i]; - const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h); - if (i < BLOCK_16X16) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) { - for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } - } - } else if (i < BLOCK_32X32) { - for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) { - for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } - } else if (i < BLOCK_64X64) { - for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) { - PICK_MODE_CONTEXT *ctx = 
get_block_context(x, i); - free_mode_context(ctx); - } - } else { - PICK_MODE_CONTEXT *ctx = get_block_context(x, i); - free_mode_context(ctx); - } - } -} - -VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { - int i, j; - volatile union { - VP9_COMP *cpi; - VP9_PTR ptr; - } ctx; - - VP9_COMP *cpi; - VP9_COMMON *cm; - - cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP)); - // Check that the CPI instance is valid - if (!cpi) - return 0; - - cm = &cpi->common; - - vp9_zero(*cpi); - - if (setjmp(cm->error.jmp)) { - VP9_PTR ptr = ctx.ptr; - - ctx.cpi->common.error.setjmp = 0; - vp9_remove_compressor(&ptr); - return 0; - } - - cm->error.setjmp = 1; - - CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); - - vp9_create_common(cm); - - init_config((VP9_PTR)cpi, oxcf); - - init_pick_mode_context(cpi); - - cm->current_video_frame = 0; - cpi->kf_overspend_bits = 0; - cpi->kf_bitrate_adjustment = 0; - cpi->frames_till_gf_update_due = 0; - cpi->gf_overspend_bits = 0; - cpi->non_gf_bitrate_adjustment = 0; - - // Set reference frame sign bias for ALTREF frame to 1 (for now) - cm->ref_frame_sign_bias[ALTREF_FRAME] = 1; - - cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; - - cpi->gold_is_last = 0; - cpi->alt_is_last = 0; - cpi->gold_is_alt = 0; - - // Spatial scalability - cpi->number_spatial_layers = oxcf->ss_number_layers; - - // Create the encoder segmentation map and set all entries to 0 - CHECK_MEM_ERROR(cm, cpi->segmentation_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - - // And a place holder structure is the coding context - // for use if we want to save and restore it - CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, - vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); - - CHECK_MEM_ERROR(cm, cpi->active_map, vpx_calloc(cm->MBs, 1)); - vpx_memset(cpi->active_map, 1, cm->MBs); - cpi->active_map_enabled = 0; - - for (i = 0; i < (sizeof(cpi->mbgraph_stats) / - sizeof(cpi->mbgraph_stats[0])); i++) { - CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats, - vpx_calloc(cm->MBs * - sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); - } - -#ifdef ENTROPY_STATS - if (cpi->pass != 1) - init_context_counters(); -#endif - -#ifdef MODE_STATS - init_tx_count_stats(); - init_switchable_interp_stats(); -#endif - - /*Initialize the feed-forward activity masking.*/ - cpi->activity_avg = 90 << 12; - - cpi->frames_since_key = 8; // Sensible default for first frame. - cpi->key_frame_frequency = cpi->oxcf.key_freq; - cpi->this_key_frame_forced = 0; - cpi->next_key_frame_forced = 0; - - cpi->source_alt_ref_pending = 0; - cpi->source_alt_ref_active = 0; - cpi->refresh_alt_ref_frame = 0; - -#if CONFIG_MULTIPLE_ARF - // Turn multiple ARF usage on/off. This is a quick hack for the initial test - // version. It should eventually be set via the codec API. 
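vp9_create_compressor above routes allocation failures through the jmp_buf in cm->error: every CHECK_MEM_ERROR longjmps back to the setjmp site, which tears down the half-built encoder via vp9_remove_compressor and returns 0. A minimal sketch of the same unwind pattern, with a hypothetical ctx_create and its fields standing in for the real encoder state:

  #include <setjmp.h>
  #include <stdlib.h>

  struct ctx { jmp_buf jmp; void *buf; };

  /* Illustrative only: a failed allocation longjmps back to the
     setjmp site, and everything allocated so far is freed in one place. */
  static struct ctx *ctx_create(size_t n) {
    struct ctx *c = calloc(1, sizeof(*c));
    if (!c) return NULL;
    if (setjmp(c->jmp)) {             /* re-entered via longjmp on failure */
      free(c->buf);
      free(c);
      return NULL;
    }
    c->buf = malloc(n);
    if (!c->buf) longjmp(c->jmp, 1);  /* the CHECK_MEM_ERROR role */
    return c;
  }

The volatile union holding cpi in the real function serves the same purpose: the pointer must survive the longjmp so the error path can still free it.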
- cpi->multi_arf_enabled = 1; - - if (cpi->multi_arf_enabled) { - cpi->sequence_number = 0; - cpi->frame_coding_order_period = 0; - vp9_zero(cpi->frame_coding_order); - vp9_zero(cpi->arf_buffer_idx); - } -#endif - - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; -#if CONFIG_INTERNAL_STATS - cpi->b_calculate_ssimg = 0; - - cpi->count = 0; - cpi->bytes = 0; - - if (cpi->b_calculate_psnr) { - cpi->total_sq_error = 0.0; - cpi->total_sq_error2 = 0.0; - cpi->total_y = 0.0; - cpi->total_u = 0.0; - cpi->total_v = 0.0; - cpi->total = 0.0; - cpi->totalp_y = 0.0; - cpi->totalp_u = 0.0; - cpi->totalp_v = 0.0; - cpi->totalp = 0.0; - cpi->tot_recode_hits = 0; - cpi->summed_quality = 0; - cpi->summed_weights = 0; - cpi->summedp_quality = 0; - cpi->summedp_weights = 0; - } - - if (cpi->b_calculate_ssimg) { - cpi->total_ssimg_y = 0; - cpi->total_ssimg_u = 0; - cpi->total_ssimg_v = 0; - cpi->total_ssimg_all = 0; - } - -#endif - - cpi->first_time_stamp_ever = INT64_MAX; - - cpi->frames_till_gf_update_due = 0; - cpi->key_frame_count = 1; - - cpi->ni_av_qi = cpi->oxcf.worst_allowed_q; - cpi->ni_tot_qi = 0; - cpi->ni_frames = 0; - cpi->tot_q = 0.0; - cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q); - cpi->total_byte_count = 0; - - cpi->rate_correction_factor = 1.0; - cpi->key_frame_rate_correction_factor = 1.0; - cpi->gf_rate_correction_factor = 1.0; - cpi->twopass.est_max_qcorrection_factor = 1.0; - - cal_nmvjointsadcost(cpi->mb.nmvjointsadcost); - cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX]; - cpi->mb.nmvcost[1] = &cpi->mb.nmvcosts[1][MV_MAX]; - cpi->mb.nmvsadcost[0] = &cpi->mb.nmvsadcosts[0][MV_MAX]; - cpi->mb.nmvsadcost[1] = &cpi->mb.nmvsadcosts[1][MV_MAX]; - cal_nmvsadcosts(cpi->mb.nmvsadcost); - - cpi->mb.nmvcost_hp[0] = &cpi->mb.nmvcosts_hp[0][MV_MAX]; - cpi->mb.nmvcost_hp[1] = &cpi->mb.nmvcosts_hp[1][MV_MAX]; - cpi->mb.nmvsadcost_hp[0] = &cpi->mb.nmvsadcosts_hp[0][MV_MAX]; - cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX]; - cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); - - for (i = 0; i < KEY_FRAME_CONTEXT; i++) - cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate; - -#ifdef OUTPUT_YUV_SRC - yuv_file = fopen("bd.yuv", "ab"); -#endif -#ifdef OUTPUT_YUV_REC - yuv_rec_file = fopen("rec.yuv", "wb"); -#endif - -#if 0 - framepsnr = fopen("framepsnr.stt", "a"); - kf_list = fopen("kf_list.stt", "w"); -#endif - - cpi->output_pkt_list = oxcf->output_pkt_list; - - cpi->enable_encode_breakout = 1; - - if (cpi->pass == 1) { - vp9_init_first_pass(cpi); - } else if (cpi->pass == 2) { - size_t packet_sz = sizeof(FIRSTPASS_STATS); - int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); - - cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; - cpi->twopass.stats_in = cpi->twopass.stats_in_start; - cpi->twopass.stats_in_end = (void *)((char *)cpi->twopass.stats_in - + (packets - 1) * packet_sz); - vp9_init_second_pass(cpi); - } - - vp9_set_speed_features(cpi); - - // Default rd threshold factors for mode selection - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) - cpi->rd_thresh_freq_fact[i][j] = 32; - for (j = 0; j < MAX_REFS; ++j) - cpi->rd_thresh_freq_sub8x8[i][j] = 32; - } - -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \ - SDX3F, SDX8F, SDX4DF)\ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \ - cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \ - cpi->fn_ptr[BT].svf_halfpix_hv = 
SVFHHV; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg, - vp9_variance32x16, vp9_sub_pixel_variance32x16, - vp9_sub_pixel_avg_variance32x16, NULL, NULL, - NULL, NULL, NULL, - vp9_sad32x16x4d) - - BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg, - vp9_variance16x32, vp9_sub_pixel_variance16x32, - vp9_sub_pixel_avg_variance16x32, NULL, NULL, - NULL, NULL, NULL, - vp9_sad16x32x4d) - - BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg, - vp9_variance64x32, vp9_sub_pixel_variance64x32, - vp9_sub_pixel_avg_variance64x32, NULL, NULL, - NULL, NULL, NULL, - vp9_sad64x32x4d) - - BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg, - vp9_variance32x64, vp9_sub_pixel_variance32x64, - vp9_sub_pixel_avg_variance32x64, NULL, NULL, - NULL, NULL, NULL, - vp9_sad32x64x4d) - - BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg, - vp9_variance32x32, vp9_sub_pixel_variance32x32, - vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h, - vp9_variance_halfpixvar32x32_v, - vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8, - vp9_sad32x32x4d) - - BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg, - vp9_variance64x64, vp9_sub_pixel_variance64x64, - vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h, - vp9_variance_halfpixvar64x64_v, - vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8, - vp9_sad64x64x4d) - - BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg, - vp9_variance16x16, vp9_sub_pixel_variance16x16, - vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h, - vp9_variance_halfpixvar16x16_v, - vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, - vp9_sad16x16x4d) - - BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg, - vp9_variance16x8, vp9_sub_pixel_variance16x8, - vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL, - vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) - - BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg, - vp9_variance8x16, vp9_sub_pixel_variance8x16, - vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL, - vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) - - BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg, - vp9_variance8x8, vp9_sub_pixel_variance8x8, - vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL, - vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) - - BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg, - vp9_variance8x4, vp9_sub_pixel_variance8x4, - vp9_sub_pixel_avg_variance8x4, NULL, NULL, - NULL, NULL, vp9_sad8x4x8, - vp9_sad8x4x4d) - - BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg, - vp9_variance4x8, vp9_sub_pixel_variance4x8, - vp9_sub_pixel_avg_variance4x8, NULL, NULL, - NULL, NULL, vp9_sad4x8x8, - vp9_sad4x8x4d) - - BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg, - vp9_variance4x4, vp9_sub_pixel_variance4x4, - vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL, - vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) - - cpi->full_search_sad = vp9_full_search_sad; - cpi->diamond_search_sad = vp9_diamond_search_sad; - cpi->refining_search_sad = vp9_refining_search_sad; - - // make sure frame 1 is okay - cpi->error_bins[0] = cpi->common.MBs; - - /* vp9_init_quantizer() is first called here. Add check in - * vp9_frame_init_quantizer() so that vp9_init_quantizer is only - * called later when needed. This will avoid unnecessary calls of - * vp9_init_quantizer() for every frame. 
- */ - vp9_init_quantizer(cpi); - - vp9_loop_filter_init(cm); - - cpi->common.error.setjmp = 0; - - vp9_zero(cpi->y_uv_mode_count); - -#ifdef MODE_TEST_HIT_STATS - vp9_zero(cpi->mode_test_hits); -#endif - - return (VP9_PTR) cpi; -} - -void vp9_remove_compressor(VP9_PTR *ptr) { - VP9_COMP *cpi = (VP9_COMP *)(*ptr); - int i; - - if (!cpi) - return; - - if (cpi && (cpi->common.current_video_frame > 0)) { - if (cpi->pass == 2) { - vp9_end_second_pass(cpi); - } - -#ifdef ENTROPY_STATS - if (cpi->pass != 1) { - print_context_counters(); - print_tree_update_probs(); - print_mode_context(cpi); - } -#endif - -#ifdef MODE_STATS - if (cpi->pass != 1) { - write_tx_count_stats(); - write_switchable_interp_stats(); - } -#endif - -#if CONFIG_INTERNAL_STATS - - vp9_clear_system_state(); - - // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count); - if (cpi->pass != 1) { - FILE *f = fopen("opsnr.stt", "a"); - double time_encoded = (cpi->last_end_time_stamp_seen - - cpi->first_time_stamp_ever) / 10000000.000; - double total_encode_time = (cpi->time_receive_data + - cpi->time_compress_data) / 1000.000; - double dr = (double)cpi->bytes * (double) 8 / (double)1000 - / time_encoded; - - if (cpi->b_calculate_psnr) { - YV12_BUFFER_CONFIG *lst_yv12 = - &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]]; - double samples = 3.0 / 2 * cpi->count * - lst_yv12->y_width * lst_yv12->y_height; - double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error); - double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2); - double total_ssim = 100 * pow(cpi->summed_quality / - cpi->summed_weights, 8.0); - double total_ssimp = 100 * pow(cpi->summedp_quality / - cpi->summedp_weights, 8.0); - - fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" - "VPXSSIM\tVPSSIMP\t Time(ms)\n"); - fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n", - dr, cpi->total / cpi->count, total_psnr, - cpi->totalp / cpi->count, total_psnr2, total_ssim, total_ssimp, - total_encode_time); - } - - if (cpi->b_calculate_ssimg) { - fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n"); - fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr, - cpi->total_ssimg_y / cpi->count, - cpi->total_ssimg_u / cpi->count, - cpi->total_ssimg_v / cpi->count, - cpi->total_ssimg_all / cpi->count, total_encode_time); - } - - fclose(f); - } - -#endif - -#ifdef MODE_TEST_HIT_STATS - if (cpi->pass != 1) { - double norm_per_pixel_mode_tests = 0; - double norm_counts[BLOCK_SIZES]; - int i; - int sb64_per_frame; - int norm_factors[BLOCK_SIZES] = - {256, 128, 128, 64, 32, 32, 16, 8, 8, 4, 2, 2, 1}; - FILE *f = fopen("mode_hit_stats.stt", "a"); - - // On average, how many mode tests do we do - for (i = 0; i < BLOCK_SIZES; ++i) { - norm_counts[i] = (double)cpi->mode_test_hits[i] / - (double)norm_factors[i]; - norm_per_pixel_mode_tests += norm_counts[i]; - } - // Convert to a number per 64x64 and per frame - sb64_per_frame = ((cpi->common.height + 63) / 64) * - ((cpi->common.width + 63) / 64); - norm_per_pixel_mode_tests = - norm_per_pixel_mode_tests / - (double)(cpi->common.current_video_frame * sb64_per_frame); - - fprintf(f, "%6.4f\n", norm_per_pixel_mode_tests); - fclose(f); - } -#endif - -#ifdef ENTROPY_STATS - { - int i, j, k; - FILE *fmode = fopen("vp9_modecontext.c", "w"); - - fprintf(fmode, "\n#include \"vp9_entropymode.h\"\n\n"); - fprintf(fmode, "const unsigned int vp9_kf_default_bmode_counts "); - fprintf(fmode, "[INTRA_MODES][INTRA_MODES]" - "[INTRA_MODES] =\n{\n"); - - for (i = 0; i < 
INTRA_MODES; i++) { - fprintf(fmode, " { // Above Mode : %d\n", i); - - for (j = 0; j < INTRA_MODES; j++) { - fprintf(fmode, " {"); - - for (k = 0; k < INTRA_MODES; k++) { - if (!intra_mode_stats[i][j][k]) - fprintf(fmode, " %5d, ", 1); - else - fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]); - } - - fprintf(fmode, "}, // left_mode %d\n", j); - } - - fprintf(fmode, " },\n"); - } - - fprintf(fmode, "};\n"); - fclose(fmode); - } -#endif - - -#if defined(SECTIONBITS_OUTPUT) - - if (0) { - int i; - FILE *f = fopen("tokenbits.stt", "a"); - - for (i = 0; i < 28; i++) - fprintf(f, "%8d", (int)(Sectionbits[i] / 256)); - - fprintf(f, "\n"); - fclose(f); - } - -#endif - -#if 0 - { - printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); - printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, - cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, - cpi->time_compress_data / 1000, - (cpi->time_receive_data + cpi->time_compress_data) / 1000); - } -#endif - } - - free_pick_mode_context(&cpi->mb); - dealloc_compressor_data(cpi); - vpx_free(cpi->mb.ss); - vpx_free(cpi->tok); - - for (i = 0; i < sizeof(cpi->mbgraph_stats) / - sizeof(cpi->mbgraph_stats[0]); ++i) { - vpx_free(cpi->mbgraph_stats[i].mb_stats); - } - - vp9_remove_common(&cpi->common); - vpx_free(cpi); - *ptr = 0; - -#ifdef OUTPUT_YUV_SRC - fclose(yuv_file); -#endif -#ifdef OUTPUT_YUV_REC - fclose(yuv_rec_file); -#endif - -#if 0 - - if (keyfile) - fclose(keyfile); - - if (framepsnr) - fclose(framepsnr); - - if (kf_list) - fclose(kf_list); - -#endif -} - - -static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, - uint8_t *recon, int recon_stride, - unsigned int cols, unsigned int rows) { - unsigned int row, col; - uint64_t total_sse = 0; - int diff; - - for (row = 0; row + 16 <= rows; row += 16) { - for (col = 0; col + 16 <= cols; col += 16) { - unsigned int sse; - - vp9_mse16x16(orig + col, orig_stride, recon + col, recon_stride, &sse); - total_sse += sse; - } - - /* Handle odd-sized width */ - if (col < cols) { - unsigned int border_row, border_col; - uint8_t *border_orig = orig; - uint8_t *border_recon = recon; - - for (border_row = 0; border_row < 16; border_row++) { - for (border_col = col; border_col < cols; border_col++) { - diff = border_orig[border_col] - border_recon[border_col]; - total_sse += diff * diff; - } - - border_orig += orig_stride; - border_recon += recon_stride; - } - } - - orig += orig_stride * 16; - recon += recon_stride * 16; - } - - /* Handle odd-sized height */ - for (; row < rows; row++) { - for (col = 0; col < cols; col++) { - diff = orig[col] - recon[col]; - total_sse += diff * diff; - } - - orig += orig_stride; - recon += recon_stride; - } - - return total_sse; -} - - -static void generate_psnr_packet(VP9_COMP *cpi) { - YV12_BUFFER_CONFIG *orig = cpi->Source; - YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; - struct vpx_codec_cx_pkt pkt; - uint64_t sse; - int i; - unsigned int width = orig->y_crop_width; - unsigned int height = orig->y_crop_height; - - pkt.kind = VPX_CODEC_PSNR_PKT; - sse = calc_plane_error(orig->y_buffer, orig->y_stride, - recon->y_buffer, recon->y_stride, - width, height); - pkt.data.psnr.sse[0] = sse; - pkt.data.psnr.sse[1] = sse; - pkt.data.psnr.samples[0] = width * height; - pkt.data.psnr.samples[1] = width * height; - - width = orig->uv_crop_width; - height = orig->uv_crop_height; - - sse = calc_plane_error(orig->u_buffer, orig->uv_stride, - recon->u_buffer, 
recon->uv_stride, - width, height); - pkt.data.psnr.sse[0] += sse; - pkt.data.psnr.sse[2] = sse; - pkt.data.psnr.samples[0] += width * height; - pkt.data.psnr.samples[2] = width * height; - - sse = calc_plane_error(orig->v_buffer, orig->uv_stride, - recon->v_buffer, recon->uv_stride, - width, height); - pkt.data.psnr.sse[0] += sse; - pkt.data.psnr.sse[3] = sse; - pkt.data.psnr.samples[0] += width * height; - pkt.data.psnr.samples[3] = width * height; - - for (i = 0; i < 4; i++) - pkt.data.psnr.psnr[i] = vp9_mse2psnr(pkt.data.psnr.samples[i], 255.0, - (double)pkt.data.psnr.sse[i]); - - vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); -} - - -int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - - if (ref_frame_flags > 7) - return -1; - - cpi->ref_frame_flags = ref_frame_flags; - return 0; -} -int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - - if (ref_frame_flags > 7) - return -1; - - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 0; - cpi->refresh_last_frame = 0; - - if (ref_frame_flags & VP9_LAST_FLAG) - cpi->refresh_last_frame = 1; - - if (ref_frame_flags & VP9_GOLD_FLAG) - cpi->refresh_golden_frame = 1; - - if (ref_frame_flags & VP9_ALT_FLAG) - cpi->refresh_alt_ref_frame = 1; - - return 0; -} - -int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; - int ref_fb_idx; - - if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx]; - else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx]; - else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx]; - else - return -1; - - vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd); - - return 0; -} - -int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; - - if (index < 0 || index >= NUM_REF_FRAMES) - return -1; - - *fb = &cm->yv12_fb[cm->ref_frame_map[index]]; - return 0; -} - -int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd) { - VP9_COMP *cpi = (VP9_COMP *)(ptr); - VP9_COMMON *cm = &cpi->common; - - int ref_fb_idx; - - if (ref_frame_flag == VP9_LAST_FLAG) - ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx]; - else if (ref_frame_flag == VP9_GOLD_FLAG) - ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx]; - else if (ref_frame_flag == VP9_ALT_FLAG) - ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx]; - else - return -1; - - vp8_yv12_copy_frame(sd, &cm->yv12_fb[ref_fb_idx]); - - return 0; -} -int vp9_update_entropy(VP9_PTR comp, int update) { - ((VP9_COMP *)comp)->common.refresh_frame_context = update; - return 0; -} - - -#ifdef OUTPUT_YUV_SRC -void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); -} -#endif - -#ifdef OUTPUT_YUV_REC -void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { - YV12_BUFFER_CONFIG *s = cm->frame_to_show; - uint8_t *src = s->y_buffer; - int h = cm->height; - - do { - fwrite(src, s->y_width, 
1, yuv_rec_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_rec_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_rec_file); - src += s->uv_stride; - } while (--h); - -#if CONFIG_ALPHA - if (s->alpha_buffer) { - src = s->alpha_buffer; - h = s->alpha_height; - do { - fwrite(src, s->alpha_width, 1, yuv_rec_file); - src += s->alpha_stride; - } while (--h); - } -#endif - - fflush(yuv_rec_file); -} -#endif - -static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, - YV12_BUFFER_CONFIG *dst_fb) { - const int in_w = src_fb->y_crop_width; - const int in_h = src_fb->y_crop_height; - const int out_w = dst_fb->y_crop_width; - const int out_h = dst_fb->y_crop_height; - int x, y, i; - - uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer, - src_fb->alpha_buffer}; - int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride, - src_fb->alpha_stride}; - - uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer, - dst_fb->alpha_buffer}; - int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride, - dst_fb->alpha_stride}; - - for (y = 0; y < out_h; y += 16) { - for (x = 0; x < out_w; x += 16) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = i == 0 ? 1 : 2; - const int x_q4 = x * (16 / factor) * in_w / out_w; - const int y_q4 = y * (16 / factor) * in_h / out_h; - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride + - x / factor * in_w / out_w; - uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor; - - vp9_convolve8(src, src_stride, dst, dst_stride, - vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w, - vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h, - 16 / factor, 16 / factor); - } - } - } - - vp8_yv12_extend_frame_borders(dst_fb); -} - - -static void update_alt_ref_frame_stats(VP9_COMP *cpi) { - // this frame refreshes means next frames don't unless specified by user - cpi->frames_since_golden = 0; - -#if CONFIG_MULTIPLE_ARF - if (!cpi->multi_arf_enabled) -#endif - // Clear the alternate reference update pending flag. - cpi->source_alt_ref_pending = 0; - - // Set the alternate reference frame active flag - cpi->source_alt_ref_active = 1; -} -static void update_golden_frame_stats(VP9_COMP *cpi) { - // Update the Golden frame usage counts. - if (cpi->refresh_golden_frame) { - // this frame refreshes means next frames don't unless specified by user - cpi->refresh_golden_frame = 0; - cpi->frames_since_golden = 0; - - // ******** Fixed Q test code only ************ - // If we are going to use the ALT reference for the next group of frames - // set a flag to say so. - if (cpi->oxcf.fixed_q >= 0 && - cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) { - cpi->source_alt_ref_pending = 1; - cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - - // TODO(ivan): For SVC encoder, GF automatic update is disabled by using - // a large GF_interval. 
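scale_and_extend_frame above addresses the source in 1/16-pel (q4) fixed point: the low four bits of each computed position select one of 16 sub-pel filter phases, the high bits give the integer source offset, and 16 * in_w / out_w is the per-output-pixel step. A toy version of just that arithmetic, with none of the convolution machinery:

  #include <stdio.h>

  /* Map output column x to an integer source column plus a q4 phase. */
  static void src_pos_q4(int x, int in_w, int out_w, int *x_int, int *phase) {
    int x_q4 = x * 16 * in_w / out_w;  /* source position in 1/16-pel units */
    *x_int = x_q4 >> 4;                /* integer source column */
    *phase = x_q4 & 0xf;               /* sub-pel filter index, 0..15 */
  }

  int main(void) {
    int x_int, phase;
    src_pos_q4(3, 1920, 1280, &x_int, &phase);
    printf("src col %d, phase %d/16\n", x_int, phase);  /* col 4, phase 8 */
    return 0;
  }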
- if (cpi->use_svc) { - cpi->frames_till_gf_update_due = INT_MAX; - } - } - - if (!cpi->source_alt_ref_pending) - cpi->source_alt_ref_active = 0; - - // Decrement the countdown to the next gf - if (cpi->frames_till_gf_update_due > 0) - cpi->frames_till_gf_update_due--; - - } else if (!cpi->refresh_alt_ref_frame) { - // Decrement the countdown to the next gf - if (cpi->frames_till_gf_update_due > 0) - cpi->frames_till_gf_update_due--; - - if (cpi->frames_till_alt_ref_frame) - cpi->frames_till_alt_ref_frame--; - - cpi->frames_since_golden++; - } -} - -static int find_fp_qindex(void) { - int i; - - for (i = 0; i < QINDEX_RANGE; i++) { - if (vp9_convert_qindex_to_q(i) >= 30.0) { - break; - } - } - - if (i == QINDEX_RANGE) - i--; - - return i; -} - -static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest, - unsigned int *frame_flags) { - (void) size; - (void) dest; - (void) frame_flags; - - vp9_set_quantizer(cpi, find_fp_qindex()); - vp9_first_pass(cpi); -} - -#define WRITE_RECON_BUFFER 0 -#if WRITE_RECON_BUFFER -void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) { - FILE *yframe; - int i; - char filename[255]; - - snprintf(filename, sizeof(filename), "cx\\y%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->y_height; i++) - fwrite(frame->y_buffer + i * frame->y_stride, - frame->y_width, 1, yframe); - - fclose(yframe); - snprintf(filename, sizeof(filename), "cx\\u%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->uv_height; i++) - fwrite(frame->u_buffer + i * frame->uv_stride, - frame->uv_width, 1, yframe); - - fclose(yframe); - snprintf(filename, sizeof(filename), "cx\\v%04d.raw", this_frame); - yframe = fopen(filename, "wb"); - - for (i = 0; i < frame->uv_height; i++) - fwrite(frame->v_buffer + i * frame->uv_stride, - frame->uv_width, 1, yframe); - - fclose(yframe); -} -#endif - -static double compute_edge_pixel_proportion(YV12_BUFFER_CONFIG *frame) { -#define EDGE_THRESH 128 - int i, j; - int num_edge_pels = 0; - int num_pels = (frame->y_height - 2) * (frame->y_width - 2); - uint8_t *prev = frame->y_buffer + 1; - uint8_t *curr = frame->y_buffer + 1 + frame->y_stride; - uint8_t *next = frame->y_buffer + 1 + 2 * frame->y_stride; - for (i = 1; i < frame->y_height - 1; i++) { - for (j = 1; j < frame->y_width - 1; j++) { - /* Sobel hor and ver gradients */ - int v = 2 * (curr[1] - curr[-1]) + (prev[1] - prev[-1]) + - (next[1] - next[-1]); - int h = 2 * (prev[0] - next[0]) + (prev[1] - next[1]) + - (prev[-1] - next[-1]); - h = (h < 0 ? -h : h); - v = (v < 0 ? -v : v); - if (h > EDGE_THRESH || v > EDGE_THRESH) - num_edge_pels++; - curr++; - prev++; - next++; - } - curr += frame->y_stride - frame->y_width + 2; - prev += frame->y_stride - frame->y_width + 2; - next += frame->y_stride - frame->y_width + 2; - } - return (double)num_edge_pels / num_pels; -} - -// Function to test for conditions that indicate we should loop -// back and recode a frame. -static int recode_loop_test(VP9_COMP *cpi, - int high_limit, int low_limit, - int q, int maxq, int minq) { - int force_recode = 0; - VP9_COMMON *cm = &cpi->common; - - // Is frame recode allowed at all - // Yes if either recode mode 1 is selected or mode 2 is selected - // and the frame is a key frame,
golden frame or alt_ref_frame - if ((cpi->sf.recode_loop == 1) || - ((cpi->sf.recode_loop == 2) && - ((cm->frame_type == KEY_FRAME) || - cpi->refresh_golden_frame || - cpi->refresh_alt_ref_frame))) { - // General over and under shoot tests - if (((cpi->projected_frame_size > high_limit) && (q < maxq)) || - ((cpi->projected_frame_size < low_limit) && (q > minq))) { - force_recode = 1; - } else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - // Deal with frame undershoot and whether or not we are - // below the automatically set cq level. - if (q > cpi->cq_target_quality && - cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) { - force_recode = 1; - } else if (q > cpi->oxcf.cq_level && - cpi->projected_frame_size < cpi->min_frame_bandwidth && - cpi->active_best_quality > cpi->oxcf.cq_level) { - // Severe undershoot and between auto and user cq level - force_recode = 1; - cpi->active_best_quality = cpi->oxcf.cq_level; - } - } - } - - return force_recode; -} - -static void update_reference_frames(VP9_COMP * const cpi) { - VP9_COMMON * const cm = &cpi->common; - - // At this point the new frame has been encoded. - // If any buffer copy / swapping is signaled it should be done here. - if (cm->frame_type == KEY_FRAME) { - ref_cnt_fb(cm->fb_idx_ref_cnt, - &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); - ref_cnt_fb(cm->fb_idx_ref_cnt, - &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - } -#if CONFIG_MULTIPLE_ARF - else if (!cpi->multi_arf_enabled && cpi->refresh_golden_frame && - !cpi->refresh_alt_ref_frame) { -#else - else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame && - !cpi->use_svc) { -#endif - /* Preserve the previously existing golden frame and update the frame in - * the alt ref slot instead. This is highly specific to the current use of - * alt-ref as a forward reference, and this needs to be generalized as - * other uses are implemented (like RTC/temporal scaling) - * - * The update to the buffer in the alt ref slot was signaled in - * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated - * as the golden frame next time. 
- */ - int tmp; - - ref_cnt_fb(cm->fb_idx_ref_cnt, - &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - - tmp = cpi->alt_fb_idx; - cpi->alt_fb_idx = cpi->gld_fb_idx; - cpi->gld_fb_idx = tmp; - } else { /* For non key/golden frames */ - if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - arf_idx = cpi->arf_buffer_idx[cpi->sequence_number + 1]; - } -#endif - ref_cnt_fb(cm->fb_idx_ref_cnt, - &cm->ref_frame_map[arf_idx], cm->new_fb_idx); - } - - if (cpi->refresh_golden_frame) { - ref_cnt_fb(cm->fb_idx_ref_cnt, - &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); - } - } - - if (cpi->refresh_last_frame) { - ref_cnt_fb(cm->fb_idx_ref_cnt, - &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); - } -} - -static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { - MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct loopfilter *lf = &cm->lf; - if (xd->lossless) { - lf->filter_level = 0; - } else { - struct vpx_usec_timer timer; - - vp9_clear_system_state(); - - vpx_usec_timer_start(&timer); - - vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.use_fast_lpf_pick); - - vpx_usec_timer_mark(&timer); - cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); - } - - if (lf->filter_level > 0) { - vp9_set_alt_lf_level(cpi, lf->filter_level); - vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0); - } - - vp9_extend_frame_inner_borders(cm->frame_to_show, - cm->subsampling_x, cm->subsampling_y); -} - -static void scale_references(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int i; - int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx, - cpi->alt_fb_idx}; - - for (i = 0; i < 3; i++) { - YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[refs[i]]]; - - if (ref->y_crop_width != cm->width || - ref->y_crop_height != cm->height) { - int new_fb = get_free_fb(cm); - - vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb], - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS); - scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]); - cpi->scaled_ref_idx[i] = new_fb; - } else { - cpi->scaled_ref_idx[i] = cm->ref_frame_map[refs[i]]; - cm->fb_idx_ref_cnt[cm->ref_frame_map[refs[i]]]++; - } - } -} - -static void release_scaled_references(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int i; - - for (i = 0; i < 3; i++) - cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--; -} - -static void full_to_model_count(unsigned int *model_count, - unsigned int *full_count) { - int n; - model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; - model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; - model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; - for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n) - model_count[TWO_TOKEN] += full_count[n]; - model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN]; -} - -static void full_to_model_counts( - vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) { - int i, j, k, l; - for (i = 0; i < BLOCK_TYPES; ++i) - for (j = 0; j < REF_TYPES; ++j) - for (k = 0; k < COEF_BANDS; ++k) - for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { - if (l >= 3 && k == 0) - continue; - full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]); - } -} - -#if 0 && CONFIG_INTERNAL_STATS -static void output_frame_level_debug_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - FILE *const f = fopen("tmp.stt", cm->current_video_frame ? 
"a" : "w"); - int recon_err; - - vp9_clear_system_state(); // __asm emms; - - recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); - - if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d" - "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f" - "%6d %6d %5d %5d %5d %8.2f %10d %10.3f" - "%10.3f %8d %10d %10d %10d\n", - cpi->common.current_video_frame, cpi->this_frame_target, - cpi->projected_frame_size, 0, - (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, - (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target), - (int)cpi->total_actual_bits, cm->base_qindex, - vp9_convert_qindex_to_q(cm->base_qindex), - (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, - vp9_convert_qindex_to_q(cpi->active_best_quality), - vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q, - vp9_convert_qindex_to_q(cpi->ni_av_qi), - vp9_convert_qindex_to_q(cpi->cq_target_quality), - cpi->refresh_last_frame, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, - cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, - (double)cpi->twopass.bits_left / - (1 + cpi->twopass.total_left_stats.coded_error), - cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct); - - fclose(f); - - if (0) { - FILE *const fmodes = fopen("Modes.stt", "a"); - int i; - - fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, - cm->frame_type, cpi->refresh_golden_frame, - cpi->refresh_alt_ref_frame); - - for (i = 0; i < MAX_MODES; ++i) - fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]); - for (i = 0; i < MAX_REFS; ++i) - fprintf(fmodes, "%5d ", cpi->sub8x8_mode_chosen_counts[i]); - - fprintf(fmodes, "\n"); - - fclose(fmodes); - } -} -#endif - -static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, - int * bottom_index, int * top_index) { - // Set an active best quality and if necessary active worst quality - int q = cpi->active_worst_quality; - VP9_COMMON *const cm = &cpi->common; - - if (frame_is_intra_only(cm)) { -#if !CONFIG_MULTIPLE_ARF - // Handle the special case for key frames forced when we have75 reached - // the maximum key frame interval. Here force the Q to a range - // based on the ambient Q to reduce the risk of popping. - if (cpi->this_key_frame_forced) { - int delta_qindex; - int qindex = cpi->last_boosted_qindex; - double last_boosted_q = vp9_convert_qindex_to_q(qindex); - - delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, - (last_boosted_q * 0.75)); - - cpi->active_best_quality = MAX(qindex + delta_qindex, - cpi->best_quality); - } else { - int high = 5000; - int low = 400; - double q_adj_factor = 1.0; - double q_val; - - // Baseline value derived from cpi->active_worst_quality and kf boost - cpi->active_best_quality = get_active_quality(q, cpi->kf_boost, - low, high, - kf_low_motion_minq, - kf_high_motion_minq); - - // Allow somewhat lower kf minq with small image formats. - if ((cm->width * cm->height) <= (352 * 288)) { - q_adj_factor -= 0.25; - } - - // Make a further adjustment based on the kf zero motion measure. - q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct); - - // Convert the adjustment factor to a qindex delta - // on active_best_quality. 
- q_val = vp9_convert_qindex_to_q(cpi->active_best_quality); - cpi->active_best_quality += - vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); - } -#else - double current_q; - // Force the KF quantizer to be 30% of the active_worst_quality. - current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); - cpi->active_best_quality = cpi->active_worst_quality - + vp9_compute_qdelta(cpi, current_q, current_q * 0.3); -#endif - } else if (!cpi->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { - int high = 2000; - int low = 400; - - // Use the lower of cpi->active_worst_quality and recent - // average Q as basis for GF/ARF best Q limit unless last frame was - // a key frame. - if (cpi->frames_since_key > 1 && - cpi->avg_frame_qindex < cpi->active_worst_quality) { - q = cpi->avg_frame_qindex; - } - // For constrained quality don't allow Q less than the cq level - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - if (q < cpi->cq_target_quality) - q = cpi->cq_target_quality; - if (cpi->frames_since_key > 1) { - cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, - low, high, - afq_low_motion_minq, - afq_high_motion_minq); - } else { - cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, - low, high, - gf_low_motion_minq, - gf_high_motion_minq); - } - // Constrained quality uses a slightly lower active best. - cpi->active_best_quality = cpi->active_best_quality * 15 / 16; - - } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - if (!cpi->refresh_alt_ref_frame) { - cpi->active_best_quality = cpi->cq_target_quality; - } else { - if (cpi->frames_since_key > 1) { - cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, - low, high, - afq_low_motion_minq, - afq_high_motion_minq); - } else { - cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, - low, high, - gf_low_motion_minq, - gf_high_motion_minq); - } - } - } else { - cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost, - low, high, - gf_low_motion_minq, - gf_high_motion_minq); - } - } else { - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - cpi->active_best_quality = cpi->cq_target_quality; - } else { - cpi->active_best_quality = inter_minq[q]; - // 1-pass: for now, use the average Q for the active_best, if it's lower - // than active_worst. - if (cpi->pass == 0 && (cpi->avg_frame_qindex < q)) - cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex]; - - // For the constrained quality mode we don't want - // q to fall below the cq level. - if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && - (cpi->active_best_quality < cpi->cq_target_quality)) { - // If we are strongly undershooting the target rate in the last - // frames then use the user-supplied cq value, not the auto - // cq value. - if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth) - cpi->active_best_quality = cpi->oxcf.cq_level; - else - cpi->active_best_quality = cpi->cq_target_quality; - } - } - } - - // Clip the active best and worst quality values to limits - if (cpi->active_worst_quality > cpi->worst_quality) - cpi->active_worst_quality = cpi->worst_quality; - - if (cpi->active_best_quality < cpi->best_quality) - cpi->active_best_quality = cpi->best_quality; - - if (cpi->active_best_quality > cpi->worst_quality) - cpi->active_best_quality = cpi->worst_quality; - - if (cpi->active_worst_quality < cpi->active_best_quality) - cpi->active_worst_quality = cpi->active_best_quality; - - // Limit Q range for the adaptive loop.
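The four clips just above establish a single invariant: both active values end up inside the user's [best_quality, worst_quality] range, and active_best never exceeds active_worst (a smaller qindex means higher quality). The same sequence condensed into a helper; a sketch, not library code:

  static int clamp_int(int v, int lo, int hi) {
    return v < lo ? lo : (v > hi ? hi : v);
  }

  /* Enforce best_quality <= active_best <= active_worst <= worst_quality. */
  static void clip_active_q(int *active_best, int *active_worst,
                            int best_quality, int worst_quality) {
    *active_worst = clamp_int(*active_worst, best_quality, worst_quality);
    *active_best = clamp_int(*active_best, best_quality, worst_quality);
    if (*active_worst < *active_best)
      *active_worst = *active_best;
  }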
- if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) { - *top_index = - (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4; - // If this is the first (key) frame in 1-pass, active best is the user - // best-allowed, and leave the top_index to active_worst. - if (cpi->pass == 0 && cpi->common.current_video_frame == 0) { - cpi->active_best_quality = cpi->oxcf.best_allowed_q; - *top_index = cpi->oxcf.worst_allowed_q; - } - } else if (!cpi->is_src_frame_alt_ref && - (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { - *top_index = - (cpi->active_worst_quality + cpi->active_best_quality) / 2; - } else { - *top_index = cpi->active_worst_quality; - } - *bottom_index = cpi->active_best_quality; - - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - q = cpi->active_best_quality; - // Special case code to try and match quality with forced key frames - } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { - q = cpi->last_boosted_qindex; - } else { - // Determine initial Q to try. - if (cpi->pass == 0) { - // 1-pass: for now, use per-frame-bw for target size of frame, scaled - // by |x| for key frame. - int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1; - q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth); - } else { - q = vp9_regulate_q(cpi, cpi->this_frame_target); - } - if (q > *top_index) - q = *top_index; - } - - return q; -} -static void encode_frame_to_data_rate(VP9_COMP *cpi, - unsigned long *size, - unsigned char *dest, - unsigned int *frame_flags) { - VP9_COMMON *const cm = &cpi->common; - TX_SIZE t; - int q; - int frame_over_shoot_limit; - int frame_under_shoot_limit; - - int loop = 0; - int loop_count; - - int q_low; - int q_high; - - int top_index; - int bottom_index; - int active_worst_qchanged = 0; - - int overshoot_seen = 0; - int undershoot_seen = 0; - - SPEED_FEATURES *const sf = &cpi->sf; - unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); - struct segmentation *const seg = &cm->seg; - - /* Scale the source buffer, if required. */ - if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || - cm->mi_rows * 8 != cpi->un_scaled_source->y_height) { - scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source); - cpi->Source = &cpi->scaled_source; - } else { - cpi->Source = cpi->un_scaled_source; - } - scale_references(cpi); - - // Clear down mmx registers to allow floating point in what follows. - vp9_clear_system_state(); - - // For an alt ref frame in 2 pass we skip the call to the second - // pass function that sets the target bandwidth so we must set it here. - if (cpi->refresh_alt_ref_frame) { - // Set a per frame bit target for the alt ref frame. - cpi->per_frame_bandwidth = cpi->twopass.gf_bits; - // Set a per second target bitrate. - cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate); - } - - // Clear zbin over-quant value and mode boost values. - cpi->zbin_mode_boost = 0; - - // Enable or disable mode based tweaking of the zbin. - // For 2 pass only used where GF/ARF prediction quality - // is above a threshold. - cpi->zbin_mode_boost = 0; - cpi->zbin_mode_boost_enabled = 0; - - // Current default encoder behavior for the altref sign bias. - cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = cpi->source_alt_ref_active; - - // Check to see if a key frame is signaled. - // For two pass with auto key frame enabled cm->frame_type may already be - // set, but not for one pass. 
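One detail from the setup above: when the alt-ref frame bypasses the second-pass target computation, its per-second target_bandwidth is rebuilt from the per-frame budget, since gf_bits is bits per frame and multiplying by the output frame rate gives bits per second. Worked through with hypothetical numbers:

  #include <stdio.h>

  int main(void) {
    long gf_bits = 180000;           /* hypothetical ARF budget, bits/frame */
    double output_framerate = 30.0;  /* frames per second */

    /* bits/frame * frames/second = bits/second, as in the ARF branch above. */
    long target_bandwidth = (long)(gf_bits * output_framerate);
    printf("%ld bps\n", target_bandwidth);  /* prints 5400000 bps */
    return 0;
  }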
- if ((cm->current_video_frame == 0) || - (cm->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && (cpi->frames_since_key % - cpi->key_frame_frequency == 0))) { - // Set frame type to key frame for the force key frame, if we exceed the - // maximum distance in an automatic keyframe selection or for the first - // frame. - cm->frame_type = KEY_FRAME; - } - - // Set default state for segment based loop filter update flags. - cm->lf.mode_ref_delta_update = 0; - - // Initialize cpi->mv_step_param to default based on max resolution. - cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); - // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. - if (sf->auto_mv_step_size) { - if (frame_is_intra_only(&cpi->common)) { - // Initialize max_mv_magnitude for use in the first INTER frame - // after a key/intra-only frame. - cpi->max_mv_magnitude = max_mv_def; - } else { - if (cm->show_frame) - // Allow mv_steps to correspond to twice the max mv magnitude found - // in the previous frame, capped by the default max_mv_magnitude based - // on resolution. - cpi->mv_step_param = vp9_init_search_range( - cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); - cpi->max_mv_magnitude = 0; - } - } - - // Set various flags etc to special state if it is a key frame. - if (frame_is_intra_only(cm)) { - vp9_setup_key_frame(cpi); - // Reset the loop filter deltas and segmentation map. - setup_features(cm); - - // If segmentation is enabled force a map update for key frames. - if (seg->enabled) { - seg->update_map = 1; - seg->update_data = 1; - } - - // The alternate reference frame cannot be active for a key frame. - cpi->source_alt_ref_active = 0; - - cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0); - cm->frame_parallel_decoding_mode = - (cpi->oxcf.frame_parallel_decoding_mode != 0); - if (cm->error_resilient_mode) { - cm->frame_parallel_decoding_mode = 1; - cm->reset_frame_context = 0; - cm->refresh_frame_context = 0; - } else if (cm->intra_only) { - // Only reset the current context. - cm->reset_frame_context = 2; - } - } - - // Configure experimental use of segmentation for enhanced coding of - // static regions if indicated. - // Only allowed in second pass of two pass (as requires lagged coding) - // and if the relevant speed feature flag is set. - if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) { - configure_static_seg_features(cpi); - } - - // Decide how big to make the frame. - vp9_pick_frame_size(cpi); - - vp9_clear_system_state(); - - q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index); - - q_high = top_index; - q_low = bottom_index; - - vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, - &frame_over_shoot_limit); - -#if CONFIG_MULTIPLE_ARF - // Force the quantizer determined by the coding order pattern. - if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && - cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) { - double new_q; - double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); - int level = cpi->this_frame_weight; - assert(level >= 0); - - // Set quantizer steps at 10% increments. 
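The mv_step_param logic above sizes the motion search to the motion actually observed: the initial range comes from the frame's smaller dimension, and after the first inter frame it is twice the largest motion vector found, capped at that default. How many step sizes a diamond-style search needs to cover a given range follows a log2 relationship; the sketch below is a simplified model, not vp9_init_search_range's exact formula:

  /* Roughly: the smallest number of power-of-two step sizes whose largest
     step covers motion of magnitude max_mv. Illustrative only. */
  static int search_steps(int max_mv) {
    int steps = 1;
    while ((1 << steps) < max_mv && steps < 11)  /* 11 ~ MAX_MVSEARCH_STEPS */
      ++steps;
    return steps;
  }

Capping the range at twice the previous frame's magnitude keeps slow content cheap to search while still tracking fast motion.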
- new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); - q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q); - - bottom_index = q; - top_index = q; - q_low = q; - q_high = q; - - printf("frame:%d q:%d\n", cm->current_video_frame, q); - } -#endif - - loop_count = 0; - vp9_zero(cpi->rd_tx_select_threshes); - - if (!frame_is_intra_only(cm)) { - cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; - /* TODO: Decide this more intelligently */ - cm->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH; - set_mvcost(cpi); - } - -#if CONFIG_VP9_POSTPROC - - if (cpi->oxcf.noise_sensitivity > 0) { - int l = 0; - - switch (cpi->oxcf.noise_sensitivity) { - case 1: - l = 20; - break; - case 2: - l = 40; - break; - case 3: - l = 60; - break; - case 4: - case 5: - l = 100; - break; - case 6: - l = 150; - break; - } - - vp9_denoise(cpi->Source, cpi->Source, l); - } - -#endif - -#ifdef OUTPUT_YUV_SRC - vp9_write_yuv_frame(cpi->Source); -#endif - - do { - vp9_clear_system_state(); // __asm emms; - - vp9_set_quantizer(cpi, q); - - if (loop_count == 0) { - // Set up entropy context depending on frame type. The decoder mandates - // the use of the default context, index 0, for keyframes and inter - // frames where the error_resilient_mode or intra_only flag is set. For - // other inter-frames the encoder currently uses only two contexts; - // context 1 for ALTREF frames and context 0 for the others. - if (cm->frame_type == KEY_FRAME) { - vp9_setup_key_frame(cpi); - } else { - if (!cm->intra_only && !cm->error_resilient_mode) { - cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; - } - vp9_setup_inter_frame(cpi); - } - } - - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_vaq_frame_setup(cpi); - } - - // transform / motion compensation build reconstruction frame - - vp9_encode_frame(cpi); - - // Update the skip mb flag probabilities based on the distribution - // seen in the last encoder iteration. - // update_base_skip_probs(cpi); - - vp9_clear_system_state(); // __asm emms; - - // Dummy pack of the bitstream using up to date stats to get an - // accurate estimate of output frame size to determine if we need - // to recode. - vp9_save_coding_context(cpi); - cpi->dummy_packing = 1; - vp9_pack_bitstream(cpi, dest, size); - cpi->projected_frame_size = (*size) << 3; - vp9_restore_coding_context(cpi); - - if (frame_over_shoot_limit == 0) - frame_over_shoot_limit = 1; - active_worst_qchanged = 0; - - if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) { - loop = 0; - } else { - // Special case handling for forced key frames - if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { - int last_q = q; - int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); - - int high_err_target = cpi->ambient_err; - int low_err_target = cpi->ambient_err >> 1; - - // Prevent possible divide by zero error below for perfect KF - kf_err += !kf_err; - - // The key frame is not good enough or we can afford - // to make it better without undue risk of popping. - if ((kf_err > high_err_target && - cpi->projected_frame_size <= frame_over_shoot_limit) || - (kf_err > low_err_target && - cpi->projected_frame_size <= frame_under_shoot_limit)) { - // Lower q_high - q_high = q > q_low ? q - 1 : q_low; - - // Adjust Q - q = (q * high_err_target) / kf_err; - q = MIN(q, (q_high + q_low) >> 1); - } else if (kf_err < low_err_target && - cpi->projected_frame_size >= frame_under_shoot_limit) { - // The key frame is much better than the previous frame - // Raise q_low - q_low = q < q_high ? 
q + 1 : q_high; - - // Adjust Q - q = (q * low_err_target) / kf_err; - q = MIN(q, (q_high + q_low + 1) >> 1); - } - - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } else if (recode_loop_test( - cpi, frame_over_shoot_limit, frame_under_shoot_limit, - q, top_index, bottom_index)) { - // Is the projected frame size out of range and are we allowed - // to attempt to recode? - int last_q = q; - int retries = 0; - - // Frame size out of permitted range: - // Update correction factor & compute new Q to try... - - // Frame is too large - if (cpi->projected_frame_size > cpi->this_frame_target) { - // Raise q_low to at least the current value - q_low = q < q_high ? q + 1 : q_high; - - if (undershoot_seen || loop_count > 1) { - // Update rate_correction_factor unless - // cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 1); - - q = (q_high + q_low + 1) / 2; - } else { - // Update rate_correction_factor unless - // cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 0); - - q = vp9_regulate_q(cpi, cpi->this_frame_target); - - while (q < q_low && retries < 10) { - vp9_update_rate_correction_factors(cpi, 0); - q = vp9_regulate_q(cpi, cpi->this_frame_target); - retries++; - } - } - - overshoot_seen = 1; - } else { - // Frame is too small - q_high = q > q_low ? q - 1 : q_low; - - if (overshoot_seen || loop_count > 1) { - // Update rate_correction_factor unless - // cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 1); - - q = (q_high + q_low) / 2; - } else { - // Update rate_correction_factor unless - // cpi->active_worst_quality has changed. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, 0); - - q = vp9_regulate_q(cpi, cpi->this_frame_target); - - // Special case reset for q_low for constrained quality. - // This should only trigger where there is very substantial - // undershoot on a frame and the auto cq level is above - // the user-supplied value. - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) { - q_low = q; - } - - while (q > q_high && retries < 10) { - vp9_update_rate_correction_factors(cpi, 0); - q = vp9_regulate_q(cpi, cpi->this_frame_target); - retries++; - } - } - - undershoot_seen = 1; - } - - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } else { - loop = 0; - } - } - - if (cpi->is_src_frame_alt_ref) - loop = 0; - - if (loop) { - loop_count++; - -#if CONFIG_INTERNAL_STATS - cpi->tot_recode_hits++; -#endif - } - } while (loop); - - // Special case code to reduce pulsing when key frames are forced at a - // fixed interval. Note the reconstruction error if it is the frame before - // the forced key frame - if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) { - cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm)); - } - - if (cm->frame_type == KEY_FRAME) - cpi->refresh_last_frame = 1; - - cm->frame_to_show = get_frame_new_buffer(cm); - -#if WRITE_RECON_BUFFER - if (cm->show_frame) - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame); - else - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 1000); -#endif - - // Pick the loop filter level for the frame.
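The recode loop above is effectively a bisection on q between q_low and q_high: overshoot raises q_low past the current q, undershoot lowers q_high, and q resettles between them until it stops moving. Stripped of the rate-correction model (which replaces the plain midpoint with a model-driven estimate), the control flow reduces to the sketch below; frame_size_at() is a hypothetical stand-in for the dummy bitstream pack:

  /* Bisection skeleton of the recode loop above. */
  static int find_q(int q, int q_low, int q_high, int target,
                    int (*frame_size_at)(int)) {
    int loop;
    do {
      int last_q = q;
      int size = frame_size_at(q);
      if (size > target)           /* overshoot: move toward coarser q */
        q_low = q < q_high ? q + 1 : q_high;
      else if (size < target)      /* undershoot: allow finer q */
        q_high = q > q_low ? q - 1 : q_low;
      q = (q_low + q_high + 1) / 2;
      loop = q != last_q;
    } while (loop);
    return q;
  }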
- loopfilter_frame(cpi, cm); - -#if WRITE_RECON_BUFFER - if (cm->show_frame) - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 2000); - else - write_cx_frame_to_file(cm->frame_to_show, - cm->current_video_frame + 3000); -#endif - - // build the bitstream - cpi->dummy_packing = 0; - vp9_pack_bitstream(cpi, dest, size); - - if (cm->seg.update_map) - update_reference_segmentation_map(cpi); - - release_scaled_references(cpi); - update_reference_frames(cpi); - - for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->common.counts.coef[t], - cpi->coef_counts[t]); - if (!cpi->common.error_resilient_mode && - !cpi->common.frame_parallel_decoding_mode) { - vp9_adapt_coef_probs(&cpi->common); - } - - if (!frame_is_intra_only(&cpi->common)) { - FRAME_COUNTS *counts = &cpi->common.counts; - - vp9_copy(counts->y_mode, cpi->y_mode_count); - vp9_copy(counts->uv_mode, cpi->y_uv_mode_count); - vp9_copy(counts->partition, cpi->partition_count); - vp9_copy(counts->intra_inter, cpi->intra_inter_count); - vp9_copy(counts->comp_inter, cpi->comp_inter_count); - vp9_copy(counts->single_ref, cpi->single_ref_count); - vp9_copy(counts->comp_ref, cpi->comp_ref_count); - counts->mv = cpi->NMVcount; - if (!cpi->common.error_resilient_mode && - !cpi->common.frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(&cpi->common); - vp9_adapt_mv_probs(&cpi->common, cpi->common.allow_high_precision_mv); - } - } - -#ifdef ENTROPY_STATS - vp9_update_mode_context_stats(cpi); -#endif - - /* Move storing frame_type out of the above loop since it is also - * needed in motion search besides loopfilter */ - cm->last_frame_type = cm->frame_type; - - // Update rate control heuristics - cpi->total_byte_count += (*size); - cpi->projected_frame_size = (*size) << 3; - - // Post encode loop adjustment of Q prediction. - if (!active_worst_qchanged) - vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop || - cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0); - - - cpi->last_q[cm->frame_type] = cm->base_qindex; - - // Keep record of last boosted (KF/KF/ARF) Q value. - // If the current frame is coded at a lower Q then we also update it. - // If all mbs in this group are skipped only update if the Q value is - // better than that already stored. - // This is used to help set quality in forced key frames to reduce popping - if ((cm->base_qindex < cpi->last_boosted_qindex) || - ((cpi->static_mb_pct < 100) && - ((cm->frame_type == KEY_FRAME) || - cpi->refresh_alt_ref_frame || - (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) { - cpi->last_boosted_qindex = cm->base_qindex; - } - - if (cm->frame_type == KEY_FRAME) { - vp9_adjust_key_frame_context(cpi); - } - - // Keep a record of ambient average Q. - if (cm->frame_type != KEY_FRAME) - cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + - cm->base_qindex) >> 2; - - // Keep a record from which we can calculate the average Q excluding GF - // updates and key frames. - if (cm->frame_type != KEY_FRAME && - !cpi->refresh_golden_frame && - !cpi->refresh_alt_ref_frame) { - cpi->ni_frames++; - cpi->tot_q += vp9_convert_qindex_to_q(q); - cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames; - - // Calculate the average Q for normal inter frames (not key or GFU frames). - cpi->ni_tot_qi += q; - cpi->ni_av_qi = cpi->ni_tot_qi / cpi->ni_frames; - } - - // Update the buffer level variable. - // Non-viewable frames are a special case and are treated as pure overhead. 
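Several of the rate-control accumulators updated above are integer exponential moving averages; the ambient-Q tracker, for instance, keeps 3/4 of its old value, with the +2 providing round-to-nearest for the divide by 4. A standalone check of the arithmetic:

  #include <stdio.h>

  /* avg <- round((3*avg + q) / 4), the avg_frame_qindex update above. */
  static int ema_q(int avg, int q) {
    return (2 + 3 * avg + q) >> 2;
  }

  int main(void) {
    int avg = 60, i;
    for (i = 0; i < 4; ++i) {
      avg = ema_q(avg, 40);  /* converges toward 40: 55, 51, 48, 46 */
      printf("%d\n", avg);
    }
    return 0;
  }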
- if (!cm->show_frame) - cpi->bits_off_target -= cpi->projected_frame_size; - else - cpi->bits_off_target += cpi->av_per_frame_bandwidth - - cpi->projected_frame_size; - - // Clip the buffer level at the maximum buffer size - if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) - cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - - // Rolling monitors of whether we are over or underspending used to help - // regulate min and Max Q in two pass. - if (cm->frame_type != KEY_FRAME) { - cpi->rolling_target_bits = - ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4; - cpi->rolling_actual_bits = - ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4; - cpi->long_rolling_target_bits = - ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32; - cpi->long_rolling_actual_bits = - ((cpi->long_rolling_actual_bits * 31) + - cpi->projected_frame_size + 16) / 32; - } - - // Actual bits spent - cpi->total_actual_bits += cpi->projected_frame_size; - - // Debug stats - cpi->total_target_vs_actual += (cpi->this_frame_target - - cpi->projected_frame_size); - - cpi->buffer_level = cpi->bits_off_target; - -#ifndef DISABLE_RC_LONG_TERM_MEM - // Update bits left to the kf and gf groups to account for overshoot or - // undershoot on these frames - if (cm->frame_type == KEY_FRAME) { - cpi->twopass.kf_group_bits += cpi->this_frame_target - - cpi->projected_frame_size; - - cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0); - } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) { - cpi->twopass.gf_group_bits += cpi->this_frame_target - - cpi->projected_frame_size; - - cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0); - } -#endif - -#if 0 - output_frame_level_debug_stats(cpi); -#endif - if (cpi->refresh_golden_frame == 1) - cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; - else - cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN; - - if (cpi->refresh_alt_ref_frame == 1) - cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF; - else - cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF; - - - if (cpi->refresh_last_frame & cpi->refresh_golden_frame) - cpi->gold_is_last = 1; - else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame) - cpi->gold_is_last = 0; - - if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame) - cpi->alt_is_last = 1; - else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame) - cpi->alt_is_last = 0; - - if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame) - cpi->gold_is_alt = 1; - else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame) - cpi->gold_is_alt = 0; - - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - if (cpi->gold_is_last) - cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; - - if (cpi->alt_is_last) - cpi->ref_frame_flags &= ~VP9_ALT_FLAG; - - if (cpi->gold_is_alt) - cpi->ref_frame_flags &= ~VP9_ALT_FLAG; - - if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame - && (cm->frame_type != KEY_FRAME)) - // Update the alternate reference frame stats as appropriate. - update_alt_ref_frame_stats(cpi); - else - // Update the Golden frame stats as appropriate. - update_golden_frame_stats(cpi); - - if (cm->frame_type == KEY_FRAME) { - // Tell the caller that the frame was coded as a key frame - *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY; - -#if CONFIG_MULTIPLE_ARF - // Reset the sequence number. 
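The four rolling monitors above are fixed-point one-pole IIR filters, with memory of roughly 4 frames (weight 3/4) and roughly 32 frames (weight 31/32); the +2 and +16 terms round to nearest. The ambient-Q record kept earlier, avg_frame_qindex = (2 + 3 * avg + q) >> 2, is the same filter: for example, avg = 48 and q = 64 give 210 >> 2 = 52. A compilable sketch of the monitor updates:

#include <stdio.h>

/* One-pole IIR averages as used for the rolling rate monitors above;
 * names and starting values are illustrative. */
static void update_monitors(int *short_avg, int *long_avg, int sample) {
  *short_avg = (*short_avg * 3 + sample + 2) / 4;   /* ~4-frame memory */
  *long_avg = (*long_avg * 31 + sample + 16) / 32;  /* ~32-frame memory */
}

int main(void) {
  int s = 8000, l = 8000;          /* both start at the target size */
  update_monitors(&s, &l, 12000);  /* one oversized frame */
  printf("%d %d\n", s, l);         /* prints "9000 8125" */
  return 0;
}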
- if (cpi->multi_arf_enabled) { - cpi->sequence_number = 0; - cpi->frame_coding_order_period = cpi->new_frame_coding_order_period; - cpi->new_frame_coding_order_period = -1; - } -#endif - - // As this frame is a key frame the next defaults to an inter frame. - cm->frame_type = INTER_FRAME; - } else { - *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY; - -#if CONFIG_MULTIPLE_ARF - /* Increment position in the coded frame sequence. */ - if (cpi->multi_arf_enabled) { - ++cpi->sequence_number; - if (cpi->sequence_number >= cpi->frame_coding_order_period) { - cpi->sequence_number = 0; - cpi->frame_coding_order_period = cpi->new_frame_coding_order_period; - cpi->new_frame_coding_order_period = -1; - } - cpi->this_frame_weight = cpi->arf_weight[cpi->sequence_number]; - assert(cpi->this_frame_weight >= 0); - } -#endif - } - - // Clear the one shot update flags for segmentation map and mode/ref loop - // filter deltas. - cm->seg.update_map = 0; - cm->seg.update_data = 0; - cm->lf.mode_ref_delta_update = 0; - - // keep track of the last coded dimensions - cm->last_width = cm->width; - cm->last_height = cm->height; - - // reset to normal state now that we are done. - cm->last_show_frame = cm->show_frame; - if (cm->show_frame) { - // current mip will be the prev_mip for the next frame - MODE_INFO *temp = cm->prev_mip; - MODE_INFO **temp2 = cm->prev_mi_grid_base; - cm->prev_mip = cm->mip; - cm->mip = temp; - cm->prev_mi_grid_base = cm->mi_grid_base; - cm->mi_grid_base = temp2; - - // update the upper left visible macroblock ptrs - cm->mi = cm->mip + cm->mode_info_stride + 1; - cm->mi_grid_visible = cm->mi_grid_base + cm->mode_info_stride + 1; - - cpi->mb.e_mbd.mi_8x8 = cm->mi_grid_visible; - cpi->mb.e_mbd.mi_8x8[0] = cm->mi; - - // Don't increment frame counters if this was an altref buffer - // update not a real frame - ++cm->current_video_frame; - ++cpi->frames_since_key; - } - // restore prev_mi - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; - cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1; -} - -static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, - unsigned char *dest, unsigned int *frame_flags) { - cpi->enable_encode_breakout = 1; - - if (!cpi->refresh_alt_ref_frame) - vp9_second_pass(cpi); - - encode_frame_to_data_rate(cpi, size, dest, frame_flags); - // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt"); -#ifdef DISABLE_RC_LONG_TERM_MEM - cpi->twopass.bits_left -= cpi->this_frame_target; -#else - cpi->twopass.bits_left -= 8 * *size; -#endif - - if (!cpi->refresh_alt_ref_frame) { - double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate; - double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth - * cpi->oxcf.two_pass_vbrmin_section - / 100); - - if (two_pass_min_rate < lower_bounds_min_rate) - two_pass_min_rate = lower_bounds_min_rate; - - cpi->twopass.bits_left += (int64_t)(two_pass_min_rate - / cpi->oxcf.framerate); - } -} - -static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) { - VP9_COMMON *cm = &cpi->common; - if (!cpi->initial_width) { - // TODO(jkoleszar): Support 1/4 subsampling? 
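The check below infers the chroma subsampling mode from plane geometry alone: 4:2:0 input arrives with uv_width == y_width / 2 and uv_height == y_height / 2, so both flags become 1, while 4:4:4 input leaves both at 0. The rule in isolation:

/* Chroma subsampling inferred from plane sizes (sketch of the test
 * below); a NULL source leaves both flags 0, as in the code. */
static void infer_subsampling(int y_w, int y_h, int uv_w, int uv_h,
                              int *ss_x, int *ss_y) {
  *ss_x = uv_w < y_w;   /* halved chroma width  -> x-subsampled */
  *ss_y = uv_h < y_h;   /* halved chroma height -> y-subsampled */
}

4:2:2 material (halved width only) would yield ss_x = 1, ss_y = 0; the TODO above notes that 1/4 subsampling is not handled.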
- cm->subsampling_x = (sd != NULL) && sd->uv_width < sd->y_width; - cm->subsampling_y = (sd != NULL) && sd->uv_height < sd->y_height; - alloc_raw_frame_buffers(cpi); - - cpi->initial_width = cm->width; - cpi->initial_height = cm->height; - } -} - - -int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time) { - VP9_COMP *cpi = (VP9_COMP *) ptr; - struct vpx_usec_timer timer; - int res = 0; - - check_initial_width(cpi, sd); - vpx_usec_timer_start(&timer); - if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, - cpi->active_map_enabled ? cpi->active_map : NULL)) - res = -1; - vpx_usec_timer_mark(&timer); - cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); - - return res; -} - - -static int frame_is_reference(const VP9_COMP *cpi) { - const VP9_COMMON *cm = &cpi->common; - - return cm->frame_type == KEY_FRAME || - cpi->refresh_last_frame || - cpi->refresh_golden_frame || - cpi->refresh_alt_ref_frame || - cm->refresh_frame_context || - cm->lf.mode_ref_delta_update || - cm->seg.update_map || - cm->seg.update_data; -} - -#if CONFIG_MULTIPLE_ARF -int is_next_frame_arf(VP9_COMP *cpi) { - // Negative entry in frame_coding_order indicates an ARF at this position. - return cpi->frame_coding_order[cpi->sequence_number + 1] < 0 ? 1 : 0; -} -#endif - -int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, - int64_t *time_stamp, int64_t *time_end, int flush) { - VP9_COMP *cpi = (VP9_COMP *) ptr; - VP9_COMMON *cm = &cpi->common; - struct vpx_usec_timer cmptimer; - YV12_BUFFER_CONFIG *force_src_buffer = NULL; - int i; - // FILE *fp_out = fopen("enc_frame_type.txt", "a"); - - if (!cpi) - return -1; - - vpx_usec_timer_start(&cmptimer); - - cpi->source = NULL; - - cpi->common.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV; - set_mvcost(cpi); - - // Should we code an alternate reference frame. - if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) { - int frames_to_arf; - -#if CONFIG_MULTIPLE_ARF - assert(!cpi->multi_arf_enabled || - cpi->frame_coding_order[cpi->sequence_number] < 0); - - if (cpi->multi_arf_enabled && (cpi->pass == 2)) - frames_to_arf = (-cpi->frame_coding_order[cpi->sequence_number]) - - cpi->next_frame_in_order; - else -#endif - frames_to_arf = cpi->frames_till_gf_update_due; - - assert(frames_to_arf < cpi->twopass.frames_to_key); - - if ((cpi->source = vp9_lookahead_peek(cpi->lookahead, frames_to_arf))) { -#if CONFIG_MULTIPLE_ARF - cpi->alt_ref_source[cpi->arf_buffered] = cpi->source; -#else - cpi->alt_ref_source = cpi->source; -#endif - - if (cpi->oxcf.arnr_max_frames > 0) { - // Produce the filtered ARF frame. - // TODO(agrange) merge these two functions. - configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf, - cpi->gfu_boost); - vp9_temporal_filter_prepare(cpi, frames_to_arf); - vp9_extend_frame_borders(&cpi->alt_ref_buffer, - cm->subsampling_x, cm->subsampling_y); - force_src_buffer = &cpi->alt_ref_buffer; - } - - cm->show_frame = 0; - cpi->refresh_alt_ref_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_last_frame = 0; - cpi->is_src_frame_alt_ref = 0; - - // TODO(agrange) This needs to vary depending on where the next ARF is. - cpi->frames_till_alt_ref_frame = frames_to_arf; - -#if CONFIG_MULTIPLE_ARF - if (!cpi->multi_arf_enabled) -#endif - cpi->source_alt_ref_pending = 0; // Clear Pending altf Ref flag. 
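Note the discipline in the ARF path above: vp9_lookahead_peek() inspects a frame frames_to_arf ahead without consuming it, so the filtered ARF is coded from a source that is still queued. When that same entry is later popped for normal coding, the pointer comparison against alt_ref_source (below) identifies it as the overlay frame. A toy ring buffer showing why peek-then-pop returns the same entry (illustrative layout, not vp9_lookahead's actual structure):

#include <stddef.h>

#define RING_SIZE 32

struct ring {
  void *buf[RING_SIZE];
  size_t head, count;
};

static void *ring_peek(const struct ring *r, size_t ahead) {
  return ahead < r->count ? r->buf[(r->head + ahead) % RING_SIZE] : NULL;
}

static void *ring_pop(struct ring *r) {
  void *e = NULL;
  if (r->count) {
    e = r->buf[r->head];
    r->head = (r->head + 1) % RING_SIZE;
    r->count--;
  }
  return e;
}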
- } - } - - if (!cpi->source) { -#if CONFIG_MULTIPLE_ARF - int i; -#endif - if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) { - cm->show_frame = 1; - cm->intra_only = 0; - -#if CONFIG_MULTIPLE_ARF - // Is this frame the ARF overlay. - cpi->is_src_frame_alt_ref = 0; - for (i = 0; i < cpi->arf_buffered; ++i) { - if (cpi->source == cpi->alt_ref_source[i]) { - cpi->is_src_frame_alt_ref = 1; - cpi->refresh_golden_frame = 1; - break; - } - } -#else - cpi->is_src_frame_alt_ref = cpi->alt_ref_source - && (cpi->source == cpi->alt_ref_source); -#endif - if (cpi->is_src_frame_alt_ref) { - // Current frame is an ARF overlay frame. -#if CONFIG_MULTIPLE_ARF - cpi->alt_ref_source[i] = NULL; -#else - cpi->alt_ref_source = NULL; -#endif - // Don't refresh the last buffer for an ARF overlay frame. It will - // become the GF so preserve last as an alternative prediction option. - cpi->refresh_last_frame = 0; - } -#if CONFIG_MULTIPLE_ARF - ++cpi->next_frame_in_order; -#endif - } - } - - if (cpi->source) { - cpi->un_scaled_source = cpi->Source = force_src_buffer ? force_src_buffer - : &cpi->source->img; - *time_stamp = cpi->source->ts_start; - *time_end = cpi->source->ts_end; - *frame_flags = cpi->source->flags; - - // fprintf(fp_out, " Frame:%d", cm->current_video_frame); -#if CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - // fprintf(fp_out, " seq_no:%d this_frame_weight:%d", - // cpi->sequence_number, cpi->this_frame_weight); - } else { - // fprintf(fp_out, "\n"); - } -#else - // fprintf(fp_out, "\n"); -#endif - -#if CONFIG_MULTIPLE_ARF - if ((cm->frame_type != KEY_FRAME) && (cpi->pass == 2)) - cpi->source_alt_ref_pending = is_next_frame_arf(cpi); -#endif - } else { - *size = 0; - if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) { - vp9_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; - } - - // fclose(fp_out); - return -1; - } - - if (cpi->source->ts_start < cpi->first_time_stamp_ever) { - cpi->first_time_stamp_ever = cpi->source->ts_start; - cpi->last_end_time_stamp_seen = cpi->source->ts_start; - } - - // adjust frame rates based on timestamps given - if (!cpi->refresh_alt_ref_frame) { - int64_t this_duration; - int step = 0; - - if (cpi->source->ts_start == cpi->first_time_stamp_ever) { - this_duration = cpi->source->ts_end - cpi->source->ts_start; - step = 1; - } else { - int64_t last_duration = cpi->last_end_time_stamp_seen - - cpi->last_time_stamp_seen; - - this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen; - - // do a step update if the duration changes by 10% - if (last_duration) - step = (int)((this_duration - last_duration) * 10 / last_duration); - } - - if (this_duration) { - if (step) { - vp9_new_framerate(cpi, 10000000.0 / this_duration); - } else { - // Average this frame's rate into the last second's average - // frame rate. If we haven't seen 1 second yet, then average - // over the whole interval seen. 
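Timestamps here are in 1/10,000,000-second units, so a duration of 400,000 ticks corresponds to 25 fps and the MIN(..., 10000000.0) below caps the averaging window at one second. The blend that follows, in isolation:

/* Duration blend used below, in 10 MHz timestamp ticks (sketch). */
static double blended_framerate(double interval_ticks,
                                double old_framerate,
                                double this_duration_ticks) {
  double avg = 10000000.0 / old_framerate;      /* old average duration */
  avg *= (interval_ticks - avg + this_duration_ticks) / interval_ticks;
  return 10000000.0 / avg;
}

For example, one 500,000-tick (20 fps) frame inside a full one-second window nudges a 25 fps estimate to blended_framerate(10000000.0, 25.0, 500000.0), about 24.75, whereas a duration change above roughly 10% takes the step-update path instead.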
- const double interval = MIN((double)(cpi->source->ts_end - - cpi->first_time_stamp_ever), 10000000.0); - double avg_duration = 10000000.0 / cpi->oxcf.framerate; - avg_duration *= (interval - avg_duration + this_duration); - avg_duration /= interval; - - vp9_new_framerate(cpi, 10000000.0 / avg_duration); - } - } - - cpi->last_time_stamp_seen = cpi->source->ts_start; - cpi->last_end_time_stamp_seen = cpi->source->ts_end; - } - - // start with a 0 size frame - *size = 0; - - // Clear down mmx registers - vp9_clear_system_state(); // __asm emms; - - /* find a free buffer for the new frame, releasing the reference previously - * held. - */ - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - cm->new_fb_idx = get_free_fb(cm); - -#if CONFIG_MULTIPLE_ARF - /* Set up the correct ARF frame. */ - if (cpi->refresh_alt_ref_frame) { - ++cpi->arf_buffered; - } - if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) && - (cpi->pass == 2)) { - cpi->alt_fb_idx = cpi->arf_buffer_idx[cpi->sequence_number]; - } -#endif - - /* Get the mapping of L/G/A to the reference buffer pool */ - cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx]; - cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx]; - cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx]; - -#if 0 // CONFIG_MULTIPLE_ARF - if (cpi->multi_arf_enabled) { - fprintf(fp_out, " idx(%d, %d, %d, %d) active(%d, %d, %d)", - cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx, cm->new_fb_idx, - cm->active_ref_idx[0], cm->active_ref_idx[1], cm->active_ref_idx[2]); - if (cpi->refresh_alt_ref_frame) - fprintf(fp_out, " type:ARF"); - if (cpi->is_src_frame_alt_ref) - fprintf(fp_out, " type:OVERLAY[%d]", cpi->alt_fb_idx); - fprintf(fp_out, "\n"); - } -#endif - - cm->frame_type = INTER_FRAME; - cm->frame_flags = *frame_flags; - - // Reset the frame pointers to the current frame size - vp9_realloc_frame_buffer(get_frame_new_buffer(cm), - cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS); - - // Calculate scaling factors for each of the 3 available references - for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) - vp9_setup_scale_factors(cm, i); - - vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); - - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - vp9_vaq_init(); - } - - if (cpi->pass == 1) { - Pass1Encode(cpi, size, dest, frame_flags); - } else if (cpi->pass == 2) { - Pass2Encode(cpi, size, dest, frame_flags); - } else { - encode_frame_to_data_rate(cpi, size, dest, frame_flags); - } - - if (cm->refresh_frame_context) - cm->frame_contexts[cm->frame_context_idx] = cm->fc; - - if (*size > 0) { - // if its a dropped frame honor the requests on subsequent frames - cpi->droppable = !frame_is_reference(cpi); - - // return to normal state - cm->reset_frame_context = 0; - cm->refresh_frame_context = 1; - cpi->refresh_alt_ref_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_last_frame = 1; - cm->frame_type = INTER_FRAME; - } - - vpx_usec_timer_mark(&cmptimer); - cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); - - if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) - generate_psnr_packet(cpi); - -#if CONFIG_INTERNAL_STATS - - if (cpi->pass != 1) { - cpi->bytes += *size; - - if (cm->show_frame) { - cpi->count++; - - if (cpi->b_calculate_psnr) { - double ye, ue, ve; - double frame_psnr; - YV12_BUFFER_CONFIG *orig = cpi->Source; - YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; - YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer; - int y_samples = orig->y_height * orig->y_width; - int 
uv_samples = orig->uv_height * orig->uv_width; - int t_samples = y_samples + 2 * uv_samples; - double sq_error; - - ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride, - recon->y_buffer, recon->y_stride, - orig->y_crop_width, orig->y_crop_height); - - ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride, - recon->u_buffer, recon->uv_stride, - orig->uv_crop_width, orig->uv_crop_height); - - ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride, - recon->v_buffer, recon->uv_stride, - orig->uv_crop_width, orig->uv_crop_height); - - sq_error = ye + ue + ve; - - frame_psnr = vp9_mse2psnr(t_samples, 255.0, sq_error); - - cpi->total_y += vp9_mse2psnr(y_samples, 255.0, ye); - cpi->total_u += vp9_mse2psnr(uv_samples, 255.0, ue); - cpi->total_v += vp9_mse2psnr(uv_samples, 255.0, ve); - cpi->total_sq_error += sq_error; - cpi->total += frame_psnr; - { - double frame_psnr2, frame_ssim2 = 0; - double weight = 0; -#if CONFIG_VP9_POSTPROC - vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer, - cm->lf.filter_level * 10 / 6); -#endif - vp9_clear_system_state(); - - ye = (double)calc_plane_error(orig->y_buffer, orig->y_stride, - pp->y_buffer, pp->y_stride, - orig->y_crop_width, orig->y_crop_height); - - ue = (double)calc_plane_error(orig->u_buffer, orig->uv_stride, - pp->u_buffer, pp->uv_stride, - orig->uv_crop_width, orig->uv_crop_height); - - ve = (double)calc_plane_error(orig->v_buffer, orig->uv_stride, - pp->v_buffer, pp->uv_stride, - orig->uv_crop_width, orig->uv_crop_height); - - sq_error = ye + ue + ve; - - frame_psnr2 = vp9_mse2psnr(t_samples, 255.0, sq_error); - - cpi->totalp_y += vp9_mse2psnr(y_samples, 255.0, ye); - cpi->totalp_u += vp9_mse2psnr(uv_samples, 255.0, ue); - cpi->totalp_v += vp9_mse2psnr(uv_samples, 255.0, ve); - cpi->total_sq_error2 += sq_error; - cpi->totalp += frame_psnr2; - - frame_ssim2 = vp9_calc_ssim(cpi->Source, - recon, 1, &weight); - - cpi->summed_quality += frame_ssim2 * weight; - cpi->summed_weights += weight; - - frame_ssim2 = vp9_calc_ssim(cpi->Source, - &cm->post_proc_buffer, 1, &weight); - - cpi->summedp_quality += frame_ssim2 * weight; - cpi->summedp_weights += weight; -#if 0 - { - FILE *f = fopen("q_used.stt", "a"); - fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, - frame_psnr2, frame_ssim2); - fclose(f); - } -#endif - } - } - - if (cpi->b_calculate_ssimg) { - double y, u, v, frame_all; - frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, - &y, &u, &v); - cpi->total_ssimg_y += y; - cpi->total_ssimg_u += u; - cpi->total_ssimg_v += v; - cpi->total_ssimg_all += frame_all; - } - } - } - -#endif - // fclose(fp_out); - return 0; -} - -int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags) { - VP9_COMP *cpi = (VP9_COMP *) comp; - - if (!cpi->common.show_frame) { - return -1; - } else { - int ret; -#if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(&cpi->common, dest, flags); -#else - - if (cpi->common.frame_to_show) { - *dest = *cpi->common.frame_to_show; - dest->y_width = cpi->common.width; - dest->y_height = cpi->common.height; - dest->uv_height = cpi->common.height / 2; - ret = 0; - } else { - ret = -1; - } - -#endif // !CONFIG_VP9_POSTPROC - vp9_clear_system_state(); - return ret; - } -} - -int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[MAX_SEGMENTS], - int delta_lf[MAX_SEGMENTS], - unsigned int threshold[MAX_SEGMENTS]) { - VP9_COMP *cpi = (VP9_COMP *) comp; - signed char 
feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; - struct segmentation *seg = &cpi->common.seg; - int i; - - if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) - return -1; - - if (!map) { - vp9_disable_segmentation((VP9_PTR)cpi); - return 0; - } - - // Set the segmentation Map - vp9_set_segmentation_map((VP9_PTR)cpi, map); - - // Activate segmentation. - vp9_enable_segmentation((VP9_PTR)cpi); - - // Set up the quant, LF and breakout threshold segment data - for (i = 0; i < MAX_SEGMENTS; i++) { - feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; - feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; - cpi->segment_encode_breakout[i] = threshold[i]; - } - - // Enable the loop and quant changes in the feature mask - for (i = 0; i < MAX_SEGMENTS; i++) { - if (delta_q[i]) - vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); - else - vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); - - if (delta_lf[i]) - vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); - else - vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); - } - - // Initialize the feature data structure - // SEGMENT_DELTADATA 0, SEGMENT_ABSDATA 1 - vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); - - return 0; -} - -int vp9_set_active_map(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols) { - VP9_COMP *cpi = (VP9_COMP *) comp; - - if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { - if (map) { - vpx_memcpy(cpi->active_map, map, rows * cols); - cpi->active_map_enabled = 1; - } else { - cpi->active_map_enabled = 0; - } - - return 0; - } else { - // cpi->active_map_enabled = 0; - return -1; - } -} - -int vp9_set_internal_size(VP9_PTR comp, - VPX_SCALING horiz_mode, VPX_SCALING vert_mode) { - VP9_COMP *cpi = (VP9_COMP *) comp; - VP9_COMMON *cm = &cpi->common; - int hr = 0, hs = 0, vr = 0, vs = 0; - - if (horiz_mode > ONETWO || vert_mode > ONETWO) - return -1; - - Scale2Ratio(horiz_mode, &hr, &hs); - Scale2Ratio(vert_mode, &vr, &vs); - - // always go to the next whole number - cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs; - cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs; - - assert(cm->width <= cpi->initial_width); - assert(cm->height <= cpi->initial_height); - update_frame_size(cpi); - return 0; -} - -int vp9_set_size_literal(VP9_PTR comp, unsigned int width, - unsigned int height) { - VP9_COMP *cpi = (VP9_COMP *)comp; - VP9_COMMON *cm = &cpi->common; - - check_initial_width(cpi, NULL); - - if (width) { - cm->width = width; - if (cm->width * 5 < cpi->initial_width) { - cm->width = cpi->initial_width / 5 + 1; - printf("Warning: Desired width too small, changed to %d \n", cm->width); - } - if (cm->width > cpi->initial_width) { - cm->width = cpi->initial_width; - printf("Warning: Desired width too large, changed to %d \n", cm->width); - } - } - - if (height) { - cm->height = height; - if (cm->height * 5 < cpi->initial_height) { - cm->height = cpi->initial_height / 5 + 1; - printf("Warning: Desired height too small, changed to %d \n", cm->height); - } - if (cm->height > cpi->initial_height) { - cm->height = cpi->initial_height; - printf("Warning: Desired height too large, changed to %d \n", cm->height); - } - } - - assert(cm->width <= cpi->initial_width); - assert(cm->height <= cpi->initial_height); - update_frame_size(cpi); - return 0; -} - -void vp9_set_svc(VP9_PTR comp, int use_svc) { - VP9_COMP *cpi = (VP9_COMP *)comp; - cpi->use_svc = use_svc; - return; -} - -int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) { - int i, j; - int total = 0; - - uint8_t *src = 
source->y_buffer; - uint8_t *dst = dest->y_buffer; - - // Loop through the Y plane raw and reconstruction data summing - // (square differences) - for (i = 0; i < source->y_height; i += 16) { - for (j = 0; j < source->y_width; j += 16) { - unsigned int sse; - total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, - &sse); - } - - src += 16 * source->y_stride; - dst += 16 * dest->y_stride; - } - - return total; -} - - -int vp9_get_quantizer(VP9_PTR c) { - return ((VP9_COMP *)c)->common.base_qindex; -} diff --git a/media/libvpx/vp9/encoder/vp9_onyx_int.h b/media/libvpx/vp9/encoder/vp9_onyx_int.h deleted file mode 100644 index c098b5ed840f..000000000000 --- a/media/libvpx/vp9/encoder/vp9_onyx_int.h +++ /dev/null @@ -1,709 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_ONYX_INT_H_ -#define VP9_ENCODER_VP9_ONYX_INT_H_ - -#include <stdio.h> -#include "./vpx_config.h" -#include "vp9/common/vp9_onyx.h" -#include "vp9/encoder/vp9_treewriter.h" -#include "vp9/encoder/vp9_tokenize.h" -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_encodemb.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/common/vp9_entropy.h" -#include "vp9/common/vp9_entropymode.h" -#include "vpx_ports/mem.h" -#include "vpx/internal/vpx_codec_internal.h" -#include "vp9/encoder/vp9_mcomp.h" -#include "vp9/common/vp9_findnearmv.h" -#include "vp9/encoder/vp9_lookahead.h" - -#define DISABLE_RC_LONG_TERM_MEM 0 - -// #define MODE_TEST_HIT_STATS - -// #define SPEEDSTATS 1 -#if CONFIG_MULTIPLE_ARF -// Set MIN_GF_INTERVAL to 1 for the full decomposition.
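For reference, vp9_calc_ss_err at the top of this hunk reduces to a plain 16x16 tile walk over the Y planes; this sketch substitutes a scalar block_sse_16x16() for the optimized vp9_mse16x16:

#include <stdint.h>

static unsigned block_sse_16x16(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride) {
  unsigned sse = 0;
  int i, j;
  for (i = 0; i < 16; i++)
    for (j = 0; j < 16; j++) {
      const int d = a[i * a_stride + j] - b[i * b_stride + j];
      sse += (unsigned)(d * d);
    }
  return sse;
}

static int64_t y_plane_sse(const uint8_t *src, int src_stride,
                           const uint8_t *dst, int dst_stride,
                           int width, int height) {
  int64_t total = 0;
  int i, j;
  for (i = 0; i < height; i += 16)
    for (j = 0; j < width; j += 16)
      total += block_sse_16x16(src + i * src_stride + j, src_stride,
                               dst + i * dst_stride + j, dst_stride);
  return total;
}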
-#define MIN_GF_INTERVAL 2 -#else -#define MIN_GF_INTERVAL 4 -#endif -#define DEFAULT_GF_INTERVAL 7 - -#define KEY_FRAME_CONTEXT 5 - -#define MAX_MODES 30 -#define MAX_REFS 6 - -#define MIN_THRESHMULT 32 -#define MAX_THRESHMULT 512 - -#define GF_ZEROMV_ZBIN_BOOST 0 -#define LF_ZEROMV_ZBIN_BOOST 0 -#define MV_ZBIN_BOOST 0 -#define SPLIT_MV_ZBIN_BOOST 0 -#define INTRA_ZBIN_BOOST 0 - -typedef struct { - int nmvjointcost[MV_JOINTS]; - int nmvcosts[2][MV_VALS]; - int nmvcosts_hp[2][MV_VALS]; - - vp9_prob segment_pred_probs[PREDICTION_PROBS]; - - unsigned char *last_frame_seg_map_copy; - - // 0 = Intra, Last, GF, ARF - signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; - // 0 = ZERO_MV, MV - signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - - int inter_mode_counts[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; - FRAME_CONTEXT fc; -} CODING_CONTEXT; - -typedef struct { - double frame; - double intra_error; - double coded_error; - double sr_coded_error; - double ssim_weighted_pred_err; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double new_mv_count; - double duration; - double count; -} FIRSTPASS_STATS; - -typedef struct { - int frames_so_far; - double frame_intra_error; - double frame_coded_error; - double frame_pcnt_inter; - double frame_pcnt_motion; - double frame_mvr; - double frame_mvr_abs; - double frame_mvc; - double frame_mvc_abs; -} ONEPASS_FRAMESTATS; - -typedef struct { - struct { - int err; - union { - int_mv mv; - MB_PREDICTION_MODE mode; - } m; - } ref[MAX_REF_FRAMES]; -} MBGRAPH_MB_STATS; - -typedef struct { - MBGRAPH_MB_STATS *mb_stats; -} MBGRAPH_FRAME_STATS; - -// This enumerator type needs to be kept aligned with the mode order in -// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. -typedef enum { - THR_NEARESTMV, - THR_NEARESTA, - THR_NEARESTG, - - THR_DC, - - THR_NEWMV, - THR_NEWA, - THR_NEWG, - - THR_NEARMV, - THR_NEARA, - THR_COMP_NEARESTLA, - THR_COMP_NEARESTGA, - - THR_TM, - - THR_COMP_NEARLA, - THR_COMP_NEWLA, - THR_NEARG, - THR_COMP_NEARGA, - THR_COMP_NEWGA, - - THR_ZEROMV, - THR_ZEROG, - THR_ZEROA, - THR_COMP_ZEROLA, - THR_COMP_ZEROGA, - - THR_H_PRED, - THR_V_PRED, - THR_D135_PRED, - THR_D207_PRED, - THR_D153_PRED, - THR_D63_PRED, - THR_D117_PRED, - THR_D45_PRED, -} THR_MODES; - -typedef enum { - THR_LAST, - THR_GOLD, - THR_ALTR, - THR_COMP_LA, - THR_COMP_GA, - THR_INTRA, -} THR_MODES_SUB8X8; - -typedef enum { - DIAMOND = 0, - NSTEP = 1, - HEX = 2, - BIGDIA = 3, - SQUARE = 4 -} SEARCH_METHODS; - -typedef enum { - USE_FULL_RD = 0, - USE_LARGESTINTRA, - USE_LARGESTINTRA_MODELINTER, - USE_LARGESTALL -} TX_SIZE_SEARCH_METHOD; - -typedef enum { - // Values should be powers of 2 so that they can be selected as bits of - // an integer flags field - - // terminate search early based on distortion so far compared to - // qp step, distortion in the neighborhood of the frame, etc. 
- FLAG_EARLY_TERMINATE = 1, - - // skips comp inter modes if the best so far is an intra mode - FLAG_SKIP_COMP_BESTINTRA = 2, - - // skips comp inter modes if the best single intermode so far does - // not have the same reference as one of the two references being - // tested - FLAG_SKIP_COMP_REFMISMATCH = 4, - - // skips oblique intra modes if the best so far is an inter mode - FLAG_SKIP_INTRA_BESTINTER = 8, - - // skips oblique intra modes at angles 27, 63, 117, 153 if the best - // intra so far is not one of the neighboring directions - FLAG_SKIP_INTRA_DIRMISMATCH = 16, - - // skips intra modes other than DC_PRED if the source variance - // is small - FLAG_SKIP_INTRA_LOWVAR = 32, -} MODE_SEARCH_SKIP_LOGIC; - -typedef enum { - SUBPEL_ITERATIVE = 0, - SUBPEL_TREE = 1, - // Other methods to come -} SUBPEL_SEARCH_METHODS; - -#define ALL_INTRA_MODES 0x3FF -#define INTRA_DC_ONLY 0x01 -#define INTRA_DC_TM ((1 << TM_PRED) | (1 << DC_PRED)) -#define INTRA_DC_H_V ((1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED)) -#define INTRA_DC_TM_H_V (INTRA_DC_TM | (1 << V_PRED) | (1 << H_PRED)) - -typedef enum { - LAST_FRAME_PARTITION_OFF = 0, - LAST_FRAME_PARTITION_LOW_MOTION = 1, - LAST_FRAME_PARTITION_ALL = 2 -} LAST_FRAME_PARTITION_METHOD; - -typedef struct { - int RD; - SEARCH_METHODS search_method; - int auto_filter; - int recode_loop; - SUBPEL_SEARCH_METHODS subpel_search_method; - int subpel_iters_per_step; - int thresh_mult[MAX_MODES]; - int thresh_mult_sub8x8[MAX_REFS]; - int max_step_search_steps; - int reduce_first_step_size; - int auto_mv_step_size; - int optimize_coefficients; - int static_segmentation; - int comp_inter_joint_search_thresh; - int adaptive_rd_thresh; - int skip_encode_sb; - int skip_encode_frame; - LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning; - TX_SIZE_SEARCH_METHOD tx_size_search_method; - int use_lp32x32fdct; - int use_avoid_tested_higherror; - int use_one_partition_size_always; - int less_rectangular_check; - int use_square_partition_only; - int mode_skip_start; - int reference_masking; - BLOCK_SIZE always_this_block_size; - int auto_min_max_partition_size; - BLOCK_SIZE min_partition_size; - BLOCK_SIZE max_partition_size; - int adjust_partitioning_from_last_frame; - int last_partitioning_redo_frequency; - int disable_split_mask; - int using_small_partition_info; - // TODO(jingning): combine the related motion search speed features - int adaptive_motion_search; - - // Implements various heuristics to skip searching modes - // The heuristics selected are based on flags - // defined in the MODE_SEARCH_SKIP_HEURISTICS enum - unsigned int mode_search_skip_flags; - // A source variance threshold below which the split mode is disabled - unsigned int disable_split_var_thresh; - // A source variance threshold below which filter search is disabled - // Choose a very large value (UINT_MAX) to use 8-tap always - unsigned int disable_filter_search_var_thresh; - int intra_y_mode_mask[TX_SIZES]; - int intra_uv_mode_mask[TX_SIZES]; - int use_rd_breakout; - int use_uv_intra_rd_estimate; - int use_fast_lpf_pick; - int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced -} SPEED_FEATURES; - -typedef struct VP9_COMP { - DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); - - DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, 
uv_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); - -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]); -#endif - - MACROBLOCK mb; - VP9_COMMON common; - VP9_CONFIG oxcf; - struct rdcost_block_args rdcost_stack; - struct lookahead_ctx *lookahead; - struct lookahead_entry *source; -#if CONFIG_MULTIPLE_ARF - struct lookahead_entry *alt_ref_source[NUM_REF_FRAMES]; -#else - struct lookahead_entry *alt_ref_source; -#endif - - YV12_BUFFER_CONFIG *Source; - YV12_BUFFER_CONFIG *un_scaled_source; - YV12_BUFFER_CONFIG scaled_source; - - unsigned int frames_till_alt_ref_frame; - int source_alt_ref_pending; - int source_alt_ref_active; - - int is_src_frame_alt_ref; - - int gold_is_last; // gold same as last frame ( short circuit gold searches) - int alt_is_last; // Alt same as last ( short circuit altref search) - int gold_is_alt; // don't do both alt and gold search ( just do gold). - - int scaled_ref_idx[3]; - int lst_fb_idx; - int gld_fb_idx; - int alt_fb_idx; - - int current_layer; - int use_svc; - -#if CONFIG_MULTIPLE_ARF - int alt_ref_fb_idx[NUM_REF_FRAMES - 3]; -#endif - int refresh_last_frame; - int refresh_golden_frame; - int refresh_alt_ref_frame; - YV12_BUFFER_CONFIG last_frame_uf; - - TOKENEXTRA *tok; - unsigned int tok_count[4][1 << 6]; - - - unsigned int frames_since_key; - unsigned int key_frame_frequency; - unsigned int this_key_frame_forced; - unsigned int next_key_frame_forced; -#if CONFIG_MULTIPLE_ARF - // Position within a frame coding order (including any additional ARF frames). - unsigned int sequence_number; - // Next frame in naturally occurring order that has not yet been coded. - int next_frame_in_order; -#endif - - // Ambient reconstruction err target for force key frames - int ambient_err; - - unsigned int mode_chosen_counts[MAX_MODES]; - unsigned int sub8x8_mode_chosen_counts[MAX_REFS]; - int64_t mode_skip_mask; - int ref_frame_mask; - int set_ref_frame_mask; - - int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; - int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS]; - int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS]; - - int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; - int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES]; - unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; - unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; - unsigned int single_ref_count[REF_CONTEXTS][2][2]; - unsigned int comp_ref_count[REF_CONTEXTS][2]; - - int64_t rd_tx_select_diff[TX_MODES]; - // FIXME(rbultje) can this overflow? 
- int rd_tx_select_threshes[4][TX_MODES]; - - int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS]; - int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS]; - - int RDMULT; - int RDDIV; - - CODING_CONTEXT coding_context; - - // Rate targetting variables - int this_frame_target; - int projected_frame_size; - int last_q[2]; // Separate values for Intra/Inter - int last_boosted_qindex; // Last boosted GF/KF/ARF q - - double rate_correction_factor; - double key_frame_rate_correction_factor; - double gf_rate_correction_factor; - - unsigned int frames_since_golden; - int frames_till_gf_update_due; // Count down till next GF - - int gf_overspend_bits; // cumulative bits overspent because of GF boost - - int non_gf_bitrate_adjustment; // Following GF to recover extra bits spent - - int kf_overspend_bits; // Bits spent on key frames to be recovered on inters - int kf_bitrate_adjustment; // number of bits to recover on each inter frame. - int max_gf_interval; - int baseline_gf_interval; - int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames - int active_arnr_strength; // <= cpi->oxcf.arnr_max_strength - - int64_t key_frame_count; - int prior_key_frame_distance[KEY_FRAME_CONTEXT]; - int per_frame_bandwidth; // Current section per frame bandwidth target - int av_per_frame_bandwidth; // Average frame size target for clip - int min_frame_bandwidth; // Minimum allocation used for any frame - int inter_frame_target; - double output_framerate; - int64_t last_time_stamp_seen; - int64_t last_end_time_stamp_seen; - int64_t first_time_stamp_ever; - - int ni_av_qi; - int ni_tot_qi; - int ni_frames; - int avg_frame_qindex; - double tot_q; - double avg_q; - - int zbin_mode_boost; - int zbin_mode_boost_enabled; - - int64_t total_byte_count; - - int buffered_mode; - - int buffer_level; - int bits_off_target; - - int rolling_target_bits; - int rolling_actual_bits; - - int long_rolling_target_bits; - int long_rolling_actual_bits; - - int64_t total_actual_bits; - int total_target_vs_actual; // debug stats - - int worst_quality; - int active_worst_quality; - int best_quality; - int active_best_quality; - - int cq_target_quality; - - int y_mode_count[4][INTRA_MODES]; - int y_uv_mode_count[INTRA_MODES][INTRA_MODES]; - unsigned int partition_count[PARTITION_CONTEXTS][PARTITION_TYPES]; - - nmv_context_counts NMVcount; - - vp9_coeff_count coef_counts[TX_SIZES][BLOCK_TYPES]; - vp9_coeff_probs_model frame_coef_probs[TX_SIZES][BLOCK_TYPES]; - vp9_coeff_stats frame_branch_ct[TX_SIZES][BLOCK_TYPES]; - - int gfu_boost; - int last_boost; - int kf_boost; - int kf_zeromotion_pct; - int gf_zeromotion_pct; - - int64_t target_bandwidth; - struct vpx_codec_pkt_list *output_pkt_list; - -#if 0 - // Experimental code for lagged and one pass - ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS]; - int one_pass_frame_index; -#endif - MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; - int mbgraph_n_frames; // number of frames filled in the above - int static_mb_pct; // % forced skip mbs by segmentation - int seg0_progress, seg0_idx, seg0_cnt; - - int decimation_factor; - int decimation_count; - - // for real time encoding - int speed; - int compressor_speed; - - int auto_worst_q; - int cpu_used; - int pass; - - vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS]; - int last_skip_probs_q[3]; - - int ref_frame_flags; - - SPEED_FEATURES sf; - int error_bins[1024]; - - unsigned int max_mv_magnitude; - int mv_step_param; - - unsigned char *segmentation_map; - - // segment 
threashold for encode breakout - int segment_encode_breakout[MAX_SEGMENTS]; - - unsigned char *active_map; - unsigned int active_map_enabled; - - fractional_mv_step_fp *find_fractional_mv_step; - fractional_mv_step_comp_fp *find_fractional_mv_step_comp; - vp9_full_search_fn_t full_search_sad; - vp9_refining_search_fn_t refining_search_sad; - vp9_diamond_search_fn_t diamond_search_sad; - vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; - uint64_t time_receive_data; - uint64_t time_compress_data; - uint64_t time_pick_lpf; - uint64_t time_encode_sb_row; - - struct twopass_rc { - unsigned int section_intra_rating; - unsigned int next_iiratio; - unsigned int this_iiratio; - FIRSTPASS_STATS total_stats; - FIRSTPASS_STATS this_frame_stats; - FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start; - FIRSTPASS_STATS total_left_stats; - int first_pass_done; - int64_t bits_left; - int64_t clip_bits_total; - double avg_iiratio; - double modified_error_total; - double modified_error_used; - double modified_error_left; - double kf_intra_err_min; - double gf_intra_err_min; - int frames_to_key; - int maxq_max_limit; - int maxq_min_limit; - int static_scene_max_gf_interval; - int kf_bits; - // Remaining error from uncoded frames in a gf group. Two pass use only - int64_t gf_group_error_left; - - // Projected total bits available for a key frame group of frames - int64_t kf_group_bits; - - // Error score of frames still to be coded in kf group - int64_t kf_group_error_left; - - // Projected Bits available for a group of frames including 1 GF or ARF - int64_t gf_group_bits; - // Bits for the golden frame or ARF - 2 pass only - int gf_bits; - int alt_extra_bits; - - int sr_update_lag; - double est_max_qcorrection_factor; - } twopass; - - YV12_BUFFER_CONFIG alt_ref_buffer; - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; - int fixed_divide[512]; - -#if CONFIG_INTERNAL_STATS - int count; - double total_y; - double total_u; - double total_v; - double total; - double total_sq_error; - double totalp_y; - double totalp_u; - double totalp_v; - double totalp; - double total_sq_error2; - int bytes; - double summed_quality; - double summed_weights; - double summedp_quality; - double summedp_weights; - unsigned int tot_recode_hits; - - - double total_ssimg_y; - double total_ssimg_u; - double total_ssimg_v; - double total_ssimg_all; - - int b_calculate_ssimg; -#endif - int b_calculate_psnr; - - // Per MB activity measurement - unsigned int activity_avg; - unsigned int *mb_activity_map; - int *mb_norm_activity_map; - int output_partition; - - /* force next frame to intra when kf_auto says so */ - int force_next_frame_intra; - - int droppable; - - int dummy_packing; /* flag to indicate if packing is dummy */ - - unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS] - [SWITCHABLE_FILTERS]; - - unsigned int tx_stepdown_count[TX_SIZES]; - - int initial_width; - int initial_height; - - int number_spatial_layers; - int enable_encode_breakout; // Default value is 1. From first pass stats, - // encode_breakout may be disabled. - -#if CONFIG_MULTIPLE_ARF - // ARF tracking variables. 
- int multi_arf_enabled; - unsigned int frame_coding_order_period; - unsigned int new_frame_coding_order_period; - int frame_coding_order[MAX_LAG_BUFFERS * 2]; - int arf_buffer_idx[MAX_LAG_BUFFERS * 3 / 2]; - int arf_weight[MAX_LAG_BUFFERS]; - int arf_buffered; - int this_frame_weight; - int max_arf_level; -#endif - -#ifdef ENTROPY_STATS - int64_t mv_ref_stats[INTER_MODE_CONTEXTS][INTER_MODES - 1][2]; -#endif - - -#ifdef MODE_TEST_HIT_STATS - // Debug / test stats - int64_t mode_test_hits[BLOCK_SIZES]; -#endif - - /* Y,U,V,(A) */ - ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; - ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; - - PARTITION_CONTEXT *above_seg_context; - PARTITION_CONTEXT left_seg_context[8]; -} VP9_COMP; - -static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - if (ref_frame == LAST_FRAME) { - return cpi->lst_fb_idx; - } else if (ref_frame == GOLDEN_FRAME) { - return cpi->gld_fb_idx; - } else { - return cpi->alt_fb_idx; - } -} - -static int get_scale_ref_frame_idx(VP9_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { - if (ref_frame == LAST_FRAME) { - return 0; - } else if (ref_frame == GOLDEN_FRAME) { - return 1; - } else { - return 2; - } -} - -void vp9_encode_frame(VP9_COMP *cpi); - -void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest, - unsigned long *size); - -void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x); - -void vp9_set_speed_features(VP9_COMP *cpi); - -int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); - -void vp9_alloc_compressor_data(VP9_COMP *cpi); - -int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget); - -static int get_token_alloc(int mb_rows, int mb_cols) { - return mb_rows * mb_cols * (48 * 16 + 4); -} - -#endif // VP9_ENCODER_VP9_ONYX_INT_H_ diff --git a/media/libvpx/vp9/encoder/vp9_picklpf.c b/media/libvpx/vp9/encoder/vp9_picklpf.c index 476ecaaa254a..d365489969cd 100644 --- a/media/libvpx/vp9/encoder/vp9_picklpf.c +++ b/media/libvpx/vp9/encoder/vp9_picklpf.c @@ -10,162 +10,96 @@ #include #include -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_picklpf.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_alloccommon.h" -#include "vp9/common/vp9_loopfilter.h" + #include "./vpx_scale_rtcd.h" -void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc, int fraction) { - const int height = src_ybc->y_height; - const int stride = src_ybc->y_stride; - const int offset = stride * ((height >> 5) * 16 - 8); - const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4; +#include "vpx_mem/vpx_mem.h" - assert(src_ybc->y_stride == dst_ybc->y_stride); - vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset, - stride * (lines_to_copy + 16)); -} +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_quant_common.h" -static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, int Fraction) { - int i, j; - int Total = 0; - int srcoffset, dstoffset; - uint8_t *src = source->y_buffer; - uint8_t *dst = dest->y_buffer; +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_picklpf.h" +#include "vp9/encoder/vp9_quantize.h" - int linestocopy = (source->y_height >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - - srcoffset = source->y_stride * (dest->y_height >> 5) * 16; - dstoffset = 
dest->y_stride * (dest->y_height >> 5) * 16; - - src += srcoffset; - dst += dstoffset; - - // Loop through the raw Y plane and reconstruction data summing the square - // differences. - for (i = 0; i < linestocopy; i += 16) { - for (j = 0; j < source->y_width; j += 16) { - unsigned int sse; - Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, - &sse); - } - - src += 16 * source->y_stride; - dst += 16 * dest->y_stride; +static int get_max_filter_level(const VP9_COMP *cpi) { + if (cpi->oxcf.pass == 2) { + return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; } - - return Total; -} - -// Enforce a minimum filter level based upon baseline Q -static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) { - int min_filter_level; - min_filter_level = 0; - - return min_filter_level; -} - -// Enforce a maximum filter level based upon baseline Q -static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) { - int max_filter_level = MAX_LOOP_FILTER; - (void)base_qindex; - - if (cpi->twopass.section_intra_rating > 8) - max_filter_level = MAX_LOOP_FILTER * 3 / 4; - - return max_filter_level; } -// Stub function for now Alt LF not used -void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) { -} - -void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { +static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi, + int filt_level, int partial_frame) { VP9_COMMON *const cm = &cpi->common; - struct loopfilter *const lf = &cm->lf; + int filt_err; - int best_err = 0; - int filt_err = 0; - const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); - const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); + vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1, + partial_frame); + filt_err = vp9_get_y_sse(sd, cm->frame_to_show); - int filter_step; - int filt_high = 0; - // Start search at previous frame filter level - int filt_mid = lf->filter_level; - int filt_low = 0; - int filt_best; + // Re-instate the unfiltered frame + vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + + return filt_err; +} + +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, + int partial_frame) { + const VP9_COMMON *const cm = &cpi->common; + const struct loopfilter *const lf = &cm->lf; + const int min_filter_level = 0; + const int max_filter_level = get_max_filter_level(cpi); int filt_direction = 0; + int best_err, filt_best; - int Bias = 0; // Bias against raising loop filter in favor of lowering it. + // Start the search at the previous frame filter level unless it is now out of + // range. + int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int ss_err[MAX_LOOP_FILTER + 1]; + + // Set each entry to -1 + vpx_memset(ss_err, 0xFF, sizeof(ss_err)); // Make a copy of the unfiltered / processed recon buffer vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 - : cpi->oxcf.Sharpness; - - // Start the search at the previous frame filter level unless it is now out of - // range. - filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); - - // Define the initial step size - filter_step = filt_mid < 16 ? 
4 : filt_mid / 4; - - // Get baseline error score - vp9_set_alt_lf_level(cpi, filt_mid); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial); - - best_err = vp9_calc_ss_err(sd, cm->frame_to_show); + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame); filt_best = filt_mid; - - // Re-instate the unfiltered frame - vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); + ss_err[filt_mid] = best_err; while (filter_step > 0) { - Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + const int filt_high = MIN(filt_mid + filter_step, max_filter_level); + const int filt_low = MAX(filt_mid - filter_step, min_filter_level); + int filt_err; - if (cpi->twopass.section_intra_rating < 20) - Bias = Bias * cpi->twopass.section_intra_rating / 20; + // Bias against raising loop filter in favor of lowering it. + int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->twopass.section_intra_rating) / 20; // yx, bias less for large block size - if (cpi->common.tx_mode != ONLY_4X4) - Bias >>= 1; + if (cm->tx_mode != ONLY_4X4) + bias >>= 1; - filt_high = ((filt_mid + filter_step) > max_filter_level) - ? max_filter_level - : (filt_mid + filter_step); - filt_low = ((filt_mid - filter_step) < min_filter_level) - ? min_filter_level - : (filt_mid - filter_step); - - if ((filt_direction <= 0) && (filt_low != filt_mid)) { + if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score - vp9_set_alt_lf_level(cpi, filt_low); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial); - - filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); - - // Re-instate the unfiltered frame - vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - + if (ss_err[filt_low] < 0) { + filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame); + ss_err[filt_low] = filt_err; + } else { + filt_err = ss_err[filt_low]; + } // If value is close to the best so far then bias towards a lower loop // filter value. - if ((filt_err - Bias) < best_err) { + if ((filt_err - bias) < best_err) { // Was it actually better than the previous best? if (filt_err < best_err) best_err = filt_err; @@ -175,17 +109,15 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { } // Now look at filt_high - if ((filt_direction >= 0) && (filt_high != filt_mid)) { - vp9_set_alt_lf_level(cpi, filt_high); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial); - - filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); - - // Re-instate the unfiltered frame - vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame); + ss_err[filt_high] = filt_err; + } else { + filt_err = ss_err[filt_high]; + } // Was it better than the previous best? - if (filt_err < (best_err - Bias)) { + if (filt_err < (best_err - bias)) { best_err = filt_err; filt_best = filt_high; } @@ -193,7 +125,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { // Half the step distance if the best filter value was the same as last time if (filt_best == filt_mid) { - filter_step = filter_step / 2; + filter_step /= 2; filt_direction = 0; } else { filt_direction = (filt_best < filt_mid) ? 
-1 : 1; @@ -201,5 +133,31 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { } } - lf->filter_level = filt_best; + return filt_best; +} + +void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, + LPF_PICK_METHOD method) { + VP9_COMMON *const cm = &cpi->common; + struct loopfilter *const lf = &cm->lf; + + lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 + : cpi->oxcf.sharpness; + + if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { + lf->filter_level = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = get_max_filter_level(cpi); + const int q = vp9_ac_quant(cm->base_qindex, 0); + // These values were determined by linear fitting the result of the + // searched level, filt_guess = q * 0.316206 + 3.87252 + int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); + if (cm->frame_type == KEY_FRAME) + filt_guess -= 4; + lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level); + } else { + lf->filter_level = search_filter_level(sd, cpi, + method == LPF_PICK_FROM_SUBIMAGE); + } } diff --git a/media/libvpx/vp9/encoder/vp9_picklpf.h b/media/libvpx/vp9/encoder/vp9_picklpf.h index 9de4cf849cc5..33c490f6935f 100644 --- a/media/libvpx/vp9/encoder/vp9_picklpf.h +++ b/media/libvpx/vp9/encoder/vp9_picklpf.h @@ -12,11 +12,19 @@ #ifndef VP9_ENCODER_VP9_PICKLPF_H_ #define VP9_ENCODER_VP9_PICKLPF_H_ +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp9/encoder/vp9_encoder.h" + struct yv12_buffer_config; struct VP9_COMP; -void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val); +void vp9_pick_filter_level(const struct yv12_buffer_config *sd, + struct VP9_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif -void vp9_pick_filter_level(struct yv12_buffer_config *sd, - struct VP9_COMP *cpi, int partial); #endif // VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/media/libvpx/vp9/encoder/vp9_pickmode.c b/media/libvpx/vp9/encoder/vp9_pickmode.c new file mode 100644 index 000000000000..eee6ffe1305f --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_pickmode.c @@ -0,0 +1,766 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include +#include + +#include "./vp9_rtcd.h" + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_pickmode.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" + +static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd, + const TileInfo *const tile, + MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int mi_row, int mi_col) { + const int *ref_sign_bias = cm->ref_frame_sign_bias; + int i, refmv_count = 0; + + const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type]; + + int different_ref_found = 0; + int context_counter = 0; + int const_motion = 0; + + // Blank the reference vector list + vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (i = 0; i < 2; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]; + const MB_MODE_INFO *const candidate = &candidate_mi->mbmi; + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mode]; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1)); + } + } + + const_motion = 1; + + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. + for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) { + const POSITION *const mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row * + xd->mi_stride]->mbmi; + different_ref_found = 1; + + if (candidate->ref_frame[0] == ref_frame) + ADD_MV_REF_LIST(candidate->mv[0]); + } + } + + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found && !refmv_count) { + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_search[i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row + * xd->mi_stride]->mbmi; + + // If the candidate is INTRA we don't want to consider its mv. 
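Taken together, the three scans in mv_refs_rt amount to the selection rule sketched below (simplified stand-alone types; the real code also tracks mode counts for the first two neighbors and falls back to differently-referenced candidates only when nothing matched):

typedef struct { short row, col; } sketch_mv;

/* Collect up to max_out distinct MVs from neighbors that used the same
 * reference frame; intra neighbors carry no usable MV and are skipped. */
static int scan_neighbors(const sketch_mv *mvs, const int *refs,
                          int n_neighbors, int ref_frame,
                          sketch_mv *out, int max_out) {
  int i, k, count = 0;
  for (i = 0; i < n_neighbors && count < max_out; i++) {
    int dup = 0;
    if (refs[i] != ref_frame)
      continue;
    for (k = 0; k < count; k++)
      if (out[k].row == mvs[i].row && out[k].col == mvs[i].col)
        dup = 1;
    if (!dup)
      out[count++] = mvs[i];
  }
  return count;
}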
+ IF_DIFF_REF_FRAME_ADD_MV(candidate); + } + } + } + + Done: + + mi->mbmi.mode_context[ref_frame] = counter_to_context[context_counter]; + + // Clamp vectors + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) + clamp_mv_ref(&mv_ref_list[i].as_mv, xd); + + return const_motion; +} + +static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *tmp_mv, int *rate_mv, + int64_t best_rd_sofar) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; + const int step_param = cpi->sf.mv.fullpel_search_step_param; + const int sadpb = x->sadperbit16; + MV mvp_full; + const int ref = mbmi->ref_frame[0]; + const MV ref_mv = mbmi->ref_mvs[ref][0].as_mv; + int dis; + int rate_mode; + const int tmp_col_min = x->mv_col_min; + const int tmp_col_max = x->mv_col_max; + const int tmp_row_min = x->mv_row_min; + const int tmp_row_max = x->mv_row_max; + int rv = 0; + int sad_list[5]; + const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, + ref); + if (cpi->common.show_frame && + (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME]) + return rv; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + } + vp9_set_mv_search_range(x, &ref_mv); + + assert(x->mv_best_ref_index[ref] <= 2); + if (x->mv_best_ref_index[ref] < 2) + mvp_full = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv; + else + mvp_full = x->pred_mv[ref]; + + mvp_full.col >>= 3; + mvp_full.row >>= 3; + + vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_sad_list(cpi, sad_list), + &ref_mv, &tmp_mv->as_mv, INT_MAX, 0); + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + // calculate the bit cost on motion vector + mvp_full.row = tmp_mv->as_mv.row * 8; + mvp_full.col = tmp_mv->as_mv.col * 8; + + *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + + rate_mode = cpi->inter_mode_cost[mbmi->mode_context[ref]] + [INTER_OFFSET(NEWMV)]; + rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > + best_rd_sofar); + + if (rv) { + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_sad_list(cpi, sad_list), + x->nmvjointcost, x->mvcost, + &dis, &x->pred_sse[ref], NULL, 0, 0); + x->pred_mv[ref] = tmp_mv->as_mv; + } + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return rv; +} + + +static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + int *out_rate_sum, int64_t *out_dist_sum, + unsigned int *var_y, unsigned int *sse_y) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
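The model_rd_for_sb_y() hunk continues below. Following the comment above, the dequant values it reads are eight times the effective quantizer step, so comparing an error energy against one squared step becomes (dq * dq) >> 6. A simplified sketch of the skip-transform classification the function performs (names mirror the hunk; this is not the literal libvpx code):

    static int classify_skip_txfm(unsigned int sse, unsigned int var,
                                  unsigned int dc_dequant,
                                  unsigned int ac_dequant) {
      /* (dq / 8)^2 == (dq * dq) / 64 == (dq * dq) >> 6 */
      if (sse < dc_dequant * dc_dequant >> 6)
        return 1;  /* DC and AC energy both below one step: skip the transform */
      if (var < ac_dequant * ac_dequant >> 6)
        return 2;  /* AC energy negligible: only the DC coefficient matters */
      return 0;    /* quantize normally */
    }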
+ unsigned int sse; + int rate; + int64_t dist; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const uint32_t dc_quant = pd->dequant[0]; + const uint32_t ac_quant = pd->dequant[1]; + unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + *var_y = var; + *sse_y = sse; + + if (sse < dc_quant * dc_quant >> 6) + x->skip_txfm[0] = 1; + else if (var < ac_quant * ac_quant >> 6) + x->skip_txfm[0] = 2; + else + x->skip_txfm[0] = 0; + + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + xd->mi[0]->mbmi.tx_size = TX_8X8; + } else { + xd->mi[0]->mbmi.tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + + vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize], + dc_quant >> 3, &rate, &dist); + *out_rate_sum = rate >> 1; + *out_dist_sum = dist << 3; + + vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize], + ac_quant >> 3, &rate, &dist); + *out_rate_sum += rate; + *out_dist_sum += dist << 4; +} + +static int get_pred_buffer(PRED_BUFFER *p, int len) { + int i; + + for (i = 0; i < len; i++) { + if (!p[i].in_use) { + p[i].in_use = 1; + return i; + } + } + return -1; +} + +static void free_pred_buffer(PRED_BUFFER *p) { + if (p != NULL) + p->in_use = 0; +} + +static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, + PREDICTION_MODE this_mode, + unsigned int var_y, unsigned int sse_y, + struct buf_2d yv12_mb[][MAX_MB_PLANE], + int *rate, int64_t *dist) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var = var_y, sse = sse_y; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc. + unsigned int thresh_dc; + if (x->encode_breakout > 0) { + // Set a maximum for threshold to avoid big PSNR loss in low bit rate + // case. Use extreme low threshold for static frames to limit + // skipping. + const unsigned int max_thresh = 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust ac threshold according to partition size. + thresh_ac >>= + 8 - (b_width_log2(bsize) + b_height_log2(bsize)); + + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking for ac and dc. 
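The get_pred_buffer()/free_pred_buffer() helpers above implement a tiny fixed-size pool over the PRED_BUFFER type declared in vp9_pickmode.h. A minimal usage sketch, assuming those definitions are in scope:

    uint8_t storage[3][64 * 64];
    PRED_BUFFER pool[3];
    int i, idx;

    for (i = 0; i < 3; ++i) {
      pool[i].data = storage[i];
      pool[i].stride = 64;
      pool[i].in_use = 0;
    }

    idx = get_pred_buffer(pool, 3);  /* returns -1 when the pool is exhausted */
    if (idx >= 0) {
      /* ... write a predictor into pool[idx].data ... */
      free_pred_buffer(&pool[idx]);  /* marks the slot reusable */
    }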
+ if (var <= thresh_ac && (sse - var) <= thresh_dc) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + // Skip UV prediction unless breakout is zero (lossless) to save + // computation with low impact on the result + if (x->encode_breakout == 0) { + xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; + xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; + vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); + } + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate = cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + + // More on this part of rate + // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency + // domain is 16. Adjust distortion accordingly. + // TODO(yunqingwang): In this function, only y-plane dist is + // calculated. + *dist = (sse << 4); // + ((sse_u + sse_v) << 4); + + // *disable_skip = 1; + } + } + } +} + +struct estimate_block_intra_args { + VP9_COMP *cpi; + MACROBLOCK *x; + PREDICTION_MODE mode; + int rate; + int64_t dist; +}; + +static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct estimate_block_intra_args* const args = arg; + VP9_COMP *const cpi = args->cpi; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + int i, j; + int rate; + int64_t dist; + unsigned int var_y, sse_y; + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); + assert(plane == 0); + (void) plane; + + p->src.buf = &src_buf_base[4 * (j * src_stride + i)]; + pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)]; + // Use source buffer as an approximation for the fully reconstructed buffer. + vp9_predict_intra_block(xd, block >> (2 * tx_size), + b_width_log2(plane_bsize), + tx_size, args->mode, + p->src.buf, src_stride, + pd->dst.buf, dst_stride, + i, j, 0); + // This procedure assumes zero offset from p->src.buf and pd->dst.buf. + model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y); + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; + args->rate += rate; + args->dist += dist; +} + +static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = { + {THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV}, + {THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG}, + {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA}, +}; + +// TODO(jingning) placeholder for inter-frame non-RD mode decision. +// this needs various further optimizations. to be continued.. 
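In the encode_breakout_test() hunk above, the decomposition of the prediction error drives the skip decision: var is the AC energy of the residual and (sse - var) approximates its DC energy, and a block is skipped only when both fall under their quantizer-derived thresholds, first for Y and then for U and V at a four-times-tighter AC bound. The shape of the test as a standalone helper (hypothetical name):

    static int breakout_ok(unsigned int sse, unsigned int var,
                           unsigned int thresh_ac, unsigned int thresh_dc) {
      /* var ~ AC energy of the residual, sse - var ~ DC energy */
      return var <= thresh_ac && (sse - var) <= thresh_dc;
    }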
+int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo *const tile, + int mi_row, int mi_col, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct macroblockd_plane *const pd = &xd->plane[0]; + PREDICTION_MODE this_mode, best_mode = ZEROMV; + MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; + TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + INTERP_FILTER best_pred_filter = EIGHTTAP; + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int64_t best_rd = INT64_MAX; + int64_t this_rd = INT64_MAX; + uint8_t skip_txfm = 0; + int rate = INT_MAX; + int64_t dist = INT64_MAX; + // var_y and sse_y are saved to be used in skipping checking + unsigned int var_y = UINT_MAX; + unsigned int sse_y = UINT_MAX; + + VP9_COMMON *cm = &cpi->common; + int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); + + const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, + intra_cost_penalty, 0); + const int64_t intra_mode_cost = 50; + + unsigned char segment_id = mbmi->segment_id; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize]; + // Mode index conversion form THR_MODES to PREDICTION_MODE for a ref frame. + INTERP_FILTER filter_ref = cm->interp_filter; + int bsl = mi_width_log2(bsize); + const int pred_filter_search = cm->interp_filter == SWITCHABLE ? + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; + int const_motion[MAX_REF_FRAMES] = { 0 }; + const int bh = num_4x4_blocks_high_lookup[bsize] << 2; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int pixels_in_block = bh * bw; + // For speed 6, the result of interp filter is reused later in actual encoding + // process. + // tmp[3] points to dst buffer, and the other 3 point to allocated buffers. + PRED_BUFFER tmp[4]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64); + struct buf_2d orig_dst = pd->dst; + PRED_BUFFER *best_pred = NULL; + PRED_BUFFER *this_mode_pred = NULL; + int i; + + // CTX is used by the temporal denoiser which is currently being developed. + // TODO(jbb): when temporal denoiser is finished and in the default build + // remove the following line; + (void) ctx; + if (cpi->sf.reuse_inter_pred_sby) { + for (i = 0; i < 3; i++) { + tmp[i].data = &pred_buf[pixels_in_block * i]; + tmp[i].stride = bw; + tmp[i].in_use = 0; + } + tmp[3].data = pd->dst.buf; + tmp[3].stride = pd->dst.stride; + tmp[3].in_use = 0; + } + + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + x->skip = 0; + + // initialize mode decisions + *returnrate = INT_MAX; + *returndistortion = INT64_MAX; + vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO)); + mbmi->sb_type = bsize; + mbmi->ref_frame[0] = NONE; + mbmi->ref_frame[1] = NONE; + mbmi->tx_size = MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cm->tx_mode]); + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? 
+ EIGHTTAP : cm->interp_filter; + mbmi->segment_id = segment_id; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + frame_mv[ZEROMV][ref_frame].as_int = 0; + + if (xd->up_available) + filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + else if (xd->left_available) + filter_ref = xd->mi[-1]->mbmi.interp_filter; + + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + int_mv *const candidates = mbmi->ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, + sf, sf); + + if (!cm->error_resilient_mode) + vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame, + candidates, mi_row, mi_col); + else + const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0], + ref_frame, candidates, + mi_row, mi_col); + + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &frame_mv[NEARESTMV][ref_frame], + &frame_mv[NEARMV][ref_frame]); + + if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) + vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, + ref_frame, bsize); + } else { + continue; + } + + // Select prediction reference frames. + xd->plane[0].pre[0] = yv12_mb[ref_frame][0]; + + clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd); + clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd); + + mbmi->ref_frame[0] = ref_frame; + + for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { + int rate_mv = 0; + int mode_rd_thresh; + + if (const_motion[ref_frame] && + (this_mode == NEARMV || this_mode == ZEROMV)) + continue; + + if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) + continue; + + mode_rd_thresh = rd_threshes[mode_idx[ref_frame - LAST_FRAME] + [INTER_OFFSET(this_mode)]]; + if (rd_less_than_thresh(best_rd, mode_rd_thresh, + rd_thresh_freq_fact[this_mode])) + continue; + + if (this_mode == NEWMV) { + if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize])) + continue; + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], + &rate_mv, best_rd)) + continue; + } + + if (this_mode != NEARESTMV && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[NEARESTMV][ref_frame].as_int) + continue; + + mbmi->mode = this_mode; + mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + + // Search for the best prediction filter type, when the resulting + // motion vector is at sub-pixel accuracy level for luma component, i.e., + // the last three bits are all zeros. 
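Motion vectors in this code are stored in 1/8-pel units: the three low bits of each component hold the sub-pel offset, so a vector is full-pel exactly when those bits are zero. The check a little further down ((mv & 0x07) != 0) therefore runs the filter search only when interpolation will actually be applied. As a sketch, assuming the vp9 MV type:

    static int mv_is_subpel(const MV *mv) {
      /* 1/8-pel units: any of the low three bits set means a sub-pel tap */
      return (mv->row & 0x07) != 0 || (mv->col & 0x07) != 0;
    }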
+ if (cpi->sf.reuse_inter_pred_sby) { + if (this_mode == NEARESTMV) { + this_mode_pred = &tmp[3]; + } else { + this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + } + } + + if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && + pred_filter_search && + ((mbmi->mv[0].as_mv.row & 0x07) != 0 || + (mbmi->mv[0].as_mv.col & 0x07) != 0)) { + int pf_rate[3]; + int64_t pf_dist[3]; + unsigned int pf_var[3]; + unsigned int pf_sse[3]; + TX_SIZE pf_tx_size[3]; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = this_mode_pred; + + for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) { + int64_t cost; + mbmi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], + &pf_dist[filter], &pf_var[filter], &pf_sse[filter]); + cost = RDCOST(x->rdmult, x->rddiv, + vp9_get_switchable_rate(cpi) + pf_rate[filter], + pf_dist[filter]); + pf_tx_size[filter] = mbmi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + + if (cpi->sf.reuse_inter_pred_sby) { + if (this_mode_pred != current_pred) { + free_pred_buffer(this_mode_pred); + this_mode_pred = current_pred; + } + + if (filter < EIGHTTAP_SHARP) { + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + } + + if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mbmi->interp_filter = best_filter; + mbmi->tx_size = pf_tx_size[mbmi->interp_filter]; + rate = pf_rate[mbmi->interp_filter]; + dist = pf_dist[mbmi->interp_filter]; + var_y = pf_var[mbmi->interp_filter]; + sse_y = pf_sse[mbmi->interp_filter]; + x->skip_txfm[0] = skip_txfm; + } else { + mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + } + + rate += rate_mv; + rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]] + [INTER_OFFSET(this_mode)]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + + // Skipping checking: test to see if this block can be reconstructed by + // prediction only. + if (cpi->allow_encode_breakout) { + encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, + this_mode, var_y, sse_y, yv12_mb, &rate, &dist); + if (x->skip) { + rate += rate_mv; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + } + } + +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx); + } +#endif + + if (this_rd < best_rd || x->skip) { + best_rd = this_rd; + *returnrate = rate; + *returndistortion = dist; + best_mode = this_mode; + best_pred_filter = mbmi->interp_filter; + best_tx_size = mbmi->tx_size; + best_ref_frame = ref_frame; + skip_txfm = x->skip_txfm[0]; + + if (cpi->sf.reuse_inter_pred_sby) { + free_pred_buffer(best_pred); + + best_pred = this_mode_pred; + } + } else { + if (cpi->sf.reuse_inter_pred_sby) + free_pred_buffer(this_mode_pred); + } + + if (x->skip) + break; + } + // If the current reference frame is valid and we found a usable mode, + // we are done. + if (best_rd < INT64_MAX) + break; + } + + // If best prediction is not in dst buf, then copy the prediction block from + // temp buf to dst buf. 
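The RDCOST() comparisons throughout this hunk compute a Lagrangian cost J = lambda * R + D, with lambda carried by the integer pair (x->rdmult, x->rddiv). A purely illustrative stand-in, deliberately not the exact libvpx macro (whose fixed-point rounding differs):

    #include <stdint.h>

    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      /* Lagrangian rate-distortion cost: lower is better */
      return ((int64_t)rate * rdmult) / rddiv + dist;
    }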
+ if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby && + best_pred->data != orig_dst.buf) { + uint8_t *copy_from, *copy_to; + + pd->dst = orig_dst; + copy_to = pd->dst.buf; + + copy_from = best_pred->data; + + vp9_convolve_copy(copy_from, bw, copy_to, pd->dst.stride, NULL, 0, NULL, 0, + bw, bh); + } + + mbmi->mode = best_mode; + mbmi->interp_filter = best_pred_filter; + mbmi->tx_size = best_tx_size; + mbmi->ref_frame[0] = best_ref_frame; + mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; + x->skip_txfm[0] = skip_txfm; + + // Perform intra prediction search, if the best SAD is above a certain + // threshold. + if (!x->skip && best_rd > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize) { + struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 }; + + const TX_SIZE intra_tx_size = + MIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + + if (cpi->sf.reuse_inter_pred_sby) { + pd->dst.buf = tmp[0].data; + pd->dst.stride = bw; + } + + for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { + const TX_SIZE saved_tx_size = mbmi->tx_size; + args.mode = this_mode; + args.rate = 0; + args.dist = 0; + mbmi->tx_size = intra_tx_size; + vp9_foreach_transformed_block_in_plane(xd, bsize, 0, + estimate_block_intra, &args); + mbmi->tx_size = saved_tx_size; + rate = args.rate; + dist = args.dist; + rate += cpi->mbmode_cost[this_mode]; + rate += intra_cost_penalty; + this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist); + + if (this_rd + intra_mode_cost < best_rd) { + best_rd = this_rd; + *returnrate = rate; + *returndistortion = dist; + mbmi->mode = this_mode; + mbmi->tx_size = intra_tx_size; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->uv_mode = this_mode; + mbmi->mv[0].as_int = INVALID_MV; + } else { + x->skip_txfm[0] = skip_txfm; + } + } + if (cpi->sf.reuse_inter_pred_sby) + pd->dst = orig_dst; + } + + return INT64_MAX; +} diff --git a/media/libvpx/vp9/encoder/vp9_pickmode.h b/media/libvpx/vp9/encoder/vp9_pickmode.h new file mode 100644 index 000000000000..49c6feb88a80 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_pickmode.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_PICKMODE_H_ +#define VP9_ENCODER_VP9_PICKMODE_H_ + +#include "vp9/encoder/vp9_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + uint8_t *data; + int stride; + int in_use; +} PRED_BUFFER; + +int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, + const struct TileInfo *const tile, + int mi_row, int mi_col, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_PICKMODE_H_ diff --git a/media/libvpx/vp9/encoder/vp9_psnr.c b/media/libvpx/vp9/encoder/vp9_psnr.c deleted file mode 100644 index 58294e15a38f..000000000000 --- a/media/libvpx/vp9/encoder/vp9_psnr.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <math.h> - -#include "vpx_scale/yv12config.h" - -#define MAX_PSNR 100 - -double vp9_mse2psnr(double samples, double peak, double mse) { - double psnr; - - if (mse > 0.0) - psnr = 10.0 * log10(peak * peak * samples / mse); - else - psnr = MAX_PSNR; // Limit to prevent / 0 - - if (psnr > MAX_PSNR) - psnr = MAX_PSNR; - - return psnr; -} diff --git a/media/libvpx/vp9/encoder/vp9_psnr.h b/media/libvpx/vp9/encoder/vp9_psnr.h deleted file mode 100644 index 15dd8366bd83..000000000000 --- a/media/libvpx/vp9/encoder/vp9_psnr.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_VP9_PSNR_H_ -#define VP9_ENCODER_VP9_PSNR_H_ - -double vp9_mse2psnr(double samples, double peak, double mse); - -#endif // VP9_ENCODER_VP9_PSNR_H_ diff --git a/media/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/vp9/encoder/vp9_quantize.c index d24be96f64fa..eababdbca974 100644 --- a/media/libvpx/vp9/encoder/vp9_quantize.c +++ b/media/libvpx/vp9/encoder/vp9_quantize.c @@ -9,18 +9,142 @@ */ #include <math.h> + #include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_quantize.h" #include "vp9/common/vp9_quant_common.h" - #include "vp9/common/vp9_seg_common.h" -#ifdef ENC_DEBUG -extern int enc_debug; -#endif +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + if (!skip_block) { + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + if (!skip_block) { + + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} + +void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count, + int skip_block, + const
int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, eob = -1; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} + +// TODO(jingning) Refactor this file and combine functions with similar +// operations. +void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, eob = -1; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + + if (!skip_block) { + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, int skip_block, @@ -30,18 +154,19 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = count, eob = -1; + int i, non_zero_count = (int)count, eob = -1; const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, zbin_ptr[1] + zbin_oq_value }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); if (!skip_block) { // Pre-scan pass - for (i = count - 1; i >= 0; i--) { + for (i = (int)count - 1; i >= 0; i--) { const int rc = scan[i]; const int coeff = coeff_ptr[rc]; @@ -83,99 +208,65 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, rc, eob; - int zbins[2], nzbins[2]; - int x, y, z, 
sz; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1), + ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) }; + const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1}; + int idx = 0; int idx_arr[1024]; + int i, eob = -1; + (void)iscan; - vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); - - eob = -1; - - // Base ZBIN - zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1); - zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1); - nzbins[0] = zbins[0] * -1; - nzbins[1] = zbins[1] * -1; + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t)); if (!skip_block) { // Pre-scan pass for (i = 0; i < n_coeffs; i++) { - rc = scan[i]; - z = coeff_ptr[rc]; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; // If the coefficient is out of the base ZBIN range, keep it for // quantization. - if (z >= zbins[rc != 0] || z <= nzbins[rc != 0]) + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; } // Quantization pass: only process the coefficients selected in // pre-scan pass. Note: idx can be zero. for (i = 0; i < idx; i++) { - rc = scan[idx_arr[i]]; + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> 15; - z = coeff_ptr[rc]; - sz = (z >> 31); // sign of z - x = (z ^ sz) - sz; // x = abs(z) + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - x += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - x = clamp(x, INT16_MIN, INT16_MAX); - y = ((((x * quant_ptr[rc != 0]) >> 16) + x) * - quant_shift_ptr[rc != 0]) >> 15; // quantize (x) - - x = (y ^ sz) - sz; // get the sign back - qcoeff_ptr[rc] = x; // write to destination - dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / 2; // dequantized value - - if (y) - eob = idx_arr[i]; // last nonzero coeffs + if (tmp) + eob = idx_arr[i]; } } *eob_ptr = eob + 1; } -struct plane_block_idx { - int plane; - int block; -}; - -// TODO(jkoleszar): returning a struct so it can be used in a const context, -// expect to refactor this further later. 
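Context for the vp9_quantize_b_32x32_c() rewrite above: the 32x32 transform carries one extra bit of scale relative to the smaller transforms, so this path halves the zbin/round constants on the way in (ROUND_POWER_OF_TWO(x, 1)) and halves the dequantized output (dequant / 2) on the way out. Worked example with a hypothetical zbin value:

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    const int zbin = 21;                                 /* example smaller-transform zbin */
    const int zbin_32x32 = ROUND_POWER_OF_TWO(zbin, 1);  /* (21 + 1) >> 1 == 11 */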
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks, - int b_idx) { - const int v_offset = y_blocks * 5 / 4; - struct plane_block_idx res; - - if (b_idx < y_blocks) { - res.plane = 0; - res.block = b_idx; - } else if (b_idx < v_offset) { - res.plane = 1; - res.block = b_idx - y_blocks; - } else { - assert(b_idx < y_blocks * 3 / 2); - res.plane = 2; - res.block = b_idx - v_offset; - } - return res; -} - -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan) { MACROBLOCKD *const xd = &x->e_mbd; - const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx); - struct macroblock_plane* p = &x->plane[pb_idx.plane]; - struct macroblockd_plane* pd = &xd->plane[pb_idx.plane]; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; - vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block), + vp9_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(pd->qcoeff, pb_idx.block), - BLOCK_OFFSET(pd->dqcoeff, pb_idx.block), - pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan); + BLOCK_OFFSET(p->qcoeff, block), + BLOCK_OFFSET(pd->dqcoeff, block), + pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { @@ -190,122 +281,106 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { } void vp9_init_quantizer(VP9_COMP *cpi) { - int i, q; VP9_COMMON *const cm = &cpi->common; + QUANTS *const quants = &cpi->quants; + int i, q, quant; for (q = 0; q < QINDEX_RANGE; q++) { const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80); const int qrounding_factor = q == 0 ? 64 : 48; - // y for (i = 0; i < 2; ++i) { - const int quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q) - : vp9_ac_quant(q, 0); - invert_quant(&cpi->y_quant[q][i], &cpi->y_quant_shift[q][i], quant); - cpi->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - cpi->y_round[q][i] = (qrounding_factor * quant) >> 7; - cm->y_dequant[q][i] = quant; - } + int qrounding_factor_fp = i == 0 ? 48 : 42; + if (q == 0) + qrounding_factor_fp = 64; - // uv - for (i = 0; i < 2; ++i) { - const int quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q) - : vp9_ac_quant(q, cm->uv_ac_delta_q); - invert_quant(&cpi->uv_quant[q][i], &cpi->uv_quant_shift[q][i], quant); - cpi->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - cpi->uv_round[q][i] = (qrounding_factor * quant) >> 7; + // y + quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q) + : vp9_ac_quant(q, 0); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); + quants->y_quant_fp[q][i] = (1 << 16) / quant; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + quants->y_round[q][i] = (qrounding_factor * quant) >> 7; + cm->y_dequant[q][i] = quant; + + // uv + quant = i == 0 ? 
vp9_dc_quant(q, cm->uv_dc_delta_q) + : vp9_ac_quant(q, cm->uv_ac_delta_q); + invert_quant(&quants->uv_quant[q][i], + &quants->uv_quant_shift[q][i], quant); + quants->uv_quant_fp[q][i] = (1 << 16) / quant; + quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; + quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); + quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; cm->uv_dequant[q][i] = quant; } -#if CONFIG_ALPHA - // alpha - for (i = 0; i < 2; ++i) { - const int quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q) - : vp9_ac_quant(q, cm->a_ac_delta_q); - invert_quant(&cpi->a_quant[q][i], &cpi->a_quant_shift[q][i], quant); - cpi->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - cpi->a_round[q][i] = (qrounding_factor * quant) >> 7; - cm->a_dequant[q][i] = quant; - } -#endif - for (i = 2; i < 8; i++) { - cpi->y_quant[q][i] = cpi->y_quant[q][1]; - cpi->y_quant_shift[q][i] = cpi->y_quant_shift[q][1]; - cpi->y_zbin[q][i] = cpi->y_zbin[q][1]; - cpi->y_round[q][i] = cpi->y_round[q][1]; + quants->y_quant[q][i] = quants->y_quant[q][1]; + quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1]; + quants->y_round_fp[q][i] = quants->y_round_fp[q][1]; + quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1]; + quants->y_zbin[q][i] = quants->y_zbin[q][1]; + quants->y_round[q][i] = quants->y_round[q][1]; cm->y_dequant[q][i] = cm->y_dequant[q][1]; - cpi->uv_quant[q][i] = cpi->uv_quant[q][1]; - cpi->uv_quant_shift[q][i] = cpi->uv_quant_shift[q][1]; - cpi->uv_zbin[q][i] = cpi->uv_zbin[q][1]; - cpi->uv_round[q][i] = cpi->uv_round[q][1]; + quants->uv_quant[q][i] = quants->uv_quant[q][1]; + quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1]; + quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1]; + quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1]; + quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; + quants->uv_round[q][i] = quants->uv_round[q][1]; cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; - -#if CONFIG_ALPHA - cpi->a_quant[q][i] = cpi->a_quant[q][1]; - cpi->a_quant_shift[q][i] = cpi->a_quant_shift[q][1]; - cpi->a_zbin[q][i] = cpi->a_zbin[q][1]; - cpi->a_round[q][i] = cpi->a_round[q][1]; - cm->a_dequant[q][i] = cm->a_dequant[q][1]; -#endif } } } -void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { +void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + QUANTS *const quants = &cpi->quants; + const int segment_id = xd->mi[0]->mbmi.segment_id; + const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + const int zbin = cpi->zbin_mode_boost; int i; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - int zbin_extra; - int segment_id = xd->mi_8x8[0]->mbmi.segment_id; - const int qindex = vp9_get_qindex(&cpi->common.seg, segment_id, - cpi->common.base_qindex); - - int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); // Y - zbin_extra = (cpi->common.y_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; - - x->plane[0].quant = cpi->y_quant[qindex]; - x->plane[0].quant_shift = cpi->y_quant_shift[qindex]; - x->plane[0].zbin = cpi->y_zbin[qindex]; - x->plane[0].round = cpi->y_round[qindex]; - x->plane[0].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex]; + x->plane[0].quant = quants->y_quant[qindex]; + x->plane[0].quant_fp = quants->y_quant_fp[qindex]; + x->plane[0].round_fp = 
quants->y_round_fp[qindex]; + x->plane[0].quant_shift = quants->y_quant_shift[qindex]; + x->plane[0].zbin = quants->y_zbin[qindex]; + x->plane[0].round = quants->y_round[qindex]; + x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; + x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); + xd->plane[0].dequant = cm->y_dequant[qindex]; // UV - zbin_extra = (cpi->common.uv_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; - for (i = 1; i < 3; i++) { - x->plane[i].quant = cpi->uv_quant[qindex]; - x->plane[i].quant_shift = cpi->uv_quant_shift[qindex]; - x->plane[i].zbin = cpi->uv_zbin[qindex]; - x->plane[i].round = cpi->uv_round[qindex]; - x->plane[i].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex]; + x->plane[i].quant = quants->uv_quant[qindex]; + x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; + x->plane[i].round_fp = quants->uv_round_fp[qindex]; + x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; + x->plane[i].zbin = quants->uv_zbin[qindex]; + x->plane[i].round = quants->uv_round[qindex]; + x->plane[i].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[i].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; + x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); + xd->plane[i].dequant = cm->uv_dequant[qindex]; } -#if CONFIG_ALPHA - x->plane[3].quant = cpi->a_quant[qindex]; - x->plane[3].quant_shift = cpi->a_quant_shift[qindex]; - x->plane[3].zbin = cpi->a_zbin[qindex]; - x->plane[3].round = cpi->a_round[qindex]; - x->plane[3].zbin_extra = (int16_t)zbin_extra; - x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex]; -#endif - - x->skip_block = vp9_segfeature_active(&cpi->common.seg, segment_id, - SEG_LVL_SKIP); - - /* save this macroblock QIndex for vp9_update_zbin_extra() */ + x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; - /* R/D setup */ - cpi->mb.errorperbit = rdmult >> 6; - cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + x->errorperbit = rdmult >> 6; + x->errorperbit += (x->errorperbit == 0); vp9_initialize_me_consts(cpi, x->q_index); } @@ -313,9 +388,9 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { const int qindex = x->q_index; const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; + cpi->zbin_mode_boost) >> 7; const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] * - (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; + cpi->zbin_mode_boost) >> 7; x->plane[0].zbin_extra = (int16_t)y_zbin_extra; x->plane[1].zbin_extra = (int16_t)uv_zbin_extra; @@ -323,26 +398,42 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { } void vp9_frame_init_quantizer(VP9_COMP *cpi) { - // Clear Zbin mode boost for default case cpi->zbin_mode_boost = 0; - - // MB level quantizer setup - vp9_mb_init_quantizer(cpi, &cpi->mb); + vp9_init_plane_quantizers(cpi, &cpi->mb); } -void vp9_set_quantizer(struct VP9_COMP *cpi, int q) { - VP9_COMMON *cm = &cpi->common; - +void vp9_set_quantizer(VP9_COMMON *cm, int q) { + // quantizer has to be reinitialized with vp9_init_quantizer() if any + // delta_q changes. cm->base_qindex = q; - - // if any of the delta_q values are changing update flag will - // have to be set. 
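The zbin_extra expressions in the hunk above scale the mode-dependent boost by the AC dequantizer in steps of 1/128. Worked example with hypothetical values:

    #include <stdint.h>

    const int ac_dequant = 32;       /* example dequant[1] */
    const int zbin_mode_boost = 16;  /* example boost */
    const int16_t zbin_extra = (int16_t)((ac_dequant * zbin_mode_boost) >> 7);
    /* (32 * 16) >> 7 == 512 >> 7 == 4 */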
cm->y_dc_delta_q = 0; cm->uv_dc_delta_q = 0; cm->uv_ac_delta_q = 0; - - // quantizer has to be reinitialized if any delta_q changes. - // As there are not any here for now this is inactive code. - // if(update) - // vp9_init_quantizer(cpi); +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int vp9_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int vp9_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) + return quantizer; + + return 63; } diff --git a/media/libvpx/vp9/encoder/vp9_quantize.h b/media/libvpx/vp9/encoder/vp9_quantize.h index c078e1d41a5a..262529b05700 100644 --- a/media/libvpx/vp9/encoder/vp9_quantize.h +++ b/media/libvpx/vp9/encoder/vp9_quantize.h @@ -11,21 +11,62 @@ #ifndef VP9_ENCODER_VP9_QUANTIZE_H_ #define VP9_ENCODER_VP9_QUANTIZE_H_ +#include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" -void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx, +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. 
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); +} QUANTS; + +void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan); struct VP9_COMP; - -void vp9_set_quantizer(struct VP9_COMP *cpi, int q); +struct VP9Common; void vp9_frame_init_quantizer(struct VP9_COMP *cpi); void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x); -void vp9_mb_init_quantizer(struct VP9_COMP *cpi, MACROBLOCK *x); +void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x); void vp9_init_quantizer(struct VP9_COMP *cpi); +void vp9_set_quantizer(struct VP9Common *cm, int q); + +int vp9_quantizer_to_qindex(int quantizer); + +int vp9_qindex_to_quantizer(int qindex); + +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/media/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/vp9/encoder/vp9_ratectrl.c index 0aa3a6893eba..b607c8559eb5 100644 --- a/media/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/media/libvpx/vp9/encoder/vp9_ratectrl.c @@ -8,32 +8,86 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <limits.h> #include <assert.h> +#include <limits.h> #include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/common/vp9_entropymode.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_encodemv.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_ratectrl.h" + +// Max rate target for 1080P and below encodes under normal circumstances +// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#define MAX_MB_RATE 250 +#define MAXRATE_1080P 2025000 + +#define DEFAULT_KF_BOOST 2000 +#define DEFAULT_GF_BOOST 2000 + +#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1 #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -// Bits Per MB at different Q (Multiplied by 512) -#define BPER_MB_NORMBITS 9 +#define FRAME_OVERHEAD_BITS 200 -static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = - { 1, 2, 3, 4, 5 }; +// Tables relating active max Q to active min Q +static int kf_low_motion_minq[QINDEX_RANGE]; +static int kf_high_motion_minq[QINDEX_RANGE]; +static int arfgf_low_motion_minq[QINDEX_RANGE]; +static int arfgf_high_motion_minq[QINDEX_RANGE]; +static int inter_minq[QINDEX_RANGE]; +static int rtc_minq[QINDEX_RANGE]; +static int gf_high = 2000; +static int gf_low = 400; +static int kf_high = 5000; +static int kf_low = 400; + +// Functions to compute the active minq lookup table entries based on a +// formulaic approach to facilitate easier adjustment of the Q tables. +// The formulae were derived from computing a 3rd order polynomial best +// fit to the original data (after plotting real maxq vs minq (not q index)) +static int get_minq_index(double maxq, double x3, double x2, double x1) { + int i; + const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq, + maxq); + + // Special case handling to deal with the step from q2.0 + // down to lossless mode represented by q 1.0. + if (minqtarget <= 2.0) + return 0; + + for (i = 0; i < QINDEX_RANGE; i++) + if (minqtarget <= vp9_convert_qindex_to_q(i)) + return i; + + return QINDEX_RANGE - 1; +} + +void vp9_rc_init_minq_luts() { + int i; + + for (i = 0; i < QINDEX_RANGE; i++) { + const double maxq = vp9_convert_qindex_to_q(i); + kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125); + kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50); + arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30); + arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50); + inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90); + rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70); + } +} // These functions use formulaic calculations to make playing with the // quantizer tables easier.
If necessary they can be replaced by lookup @@ -43,231 +97,244 @@ double vp9_convert_qindex_to_q(int qindex) { return vp9_ac_quant(qindex, 0) / 4.0; } -int vp9_gfboost_qadjust(int qindex) { - const double q = vp9_convert_qindex_to_q(qindex); - return (int)((0.00000828 * q * q * q) + - (-0.0055 * q * q) + - (1.32 * q) + 79.3); -} - -static int kfboost_qadjust(int qindex) { - const double q = vp9_convert_qindex_to_q(qindex); - return (int)((0.00000973 * q * q * q) + - (-0.00613 * q * q) + - (1.316 * q) + 121.2); -} - -int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor) { +int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor) { const double q = vp9_convert_qindex_to_q(qindex); int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000; // q based adjustment to baseline enumerator enumerator += (int)(enumerator * q) >> 12; - return (int)(0.5 + (enumerator * correction_factor / q)); + return (int)(enumerator * correction_factor / q); } -void vp9_save_coding_context(VP9_COMP *cpi) { - CODING_CONTEXT *const cc = &cpi->coding_context; - VP9_COMMON *cm = &cpi->common; - - // Stores a snapshot of key state variables which can subsequently be - // restored with a call to vp9_restore_coding_context. These functions are - // intended for use in a re-code loop in vp9_compress_frame where the - // quantizer value is adjusted between loop iterations. - vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost); - vp9_copy(cc->nmvcosts, cpi->mb.nmvcosts); - vp9_copy(cc->nmvcosts_hp, cpi->mb.nmvcosts_hp); - - vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs); - - vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy, - cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols)); - - vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas); - vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas); - - cc->fc = cm->fc; -} - -void vp9_restore_coding_context(VP9_COMP *cpi) { - CODING_CONTEXT *const cc = &cpi->coding_context; - VP9_COMMON *cm = &cpi->common; - - // Restore key state variables to the snapshot state stored in the - // previous call to vp9_save_coding_context. - vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost); - vp9_copy(cpi->mb.nmvcosts, cc->nmvcosts); - vp9_copy(cpi->mb.nmvcosts_hp, cc->nmvcosts_hp); - - vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs); - - vpx_memcpy(cm->last_frame_seg_map, - cpi->coding_context.last_frame_seg_map_copy, - (cm->mi_rows * cm->mi_cols)); - - vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas); - vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas); - - cm->fc = cc->fc; -} - -void vp9_setup_key_frame(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - - vp9_setup_past_independence(cm); - - // interval before next GF - cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - /* All buffers are implicitly updated on key frames. */ - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; -} - -void vp9_setup_inter_frame(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - if (cm->error_resilient_mode || cm->intra_only) - vp9_setup_past_independence(cm); - - assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS); - cm->fc = cm->frame_contexts[cm->frame_context_idx]; -} - -static int estimate_bits_at_q(int frame_kind, int q, int mbs, +static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, double correction_factor) { - const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor)); - - // Attempt to retain reasonable accuracy without overflow. 
The cutoff is - // chosen such that the maximum product of Bpm and MBs fits 31 bits. The - // largest Bpm takes 20 bits. - return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs - : (bpm * mbs) >> BPER_MB_NORMBITS; + const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor)); + return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS; } - -static void calc_iframe_target_size(VP9_COMP *cpi) { - // boost defaults to half second - int target; - - // Clear down mmx registers to allow floating point in what follows - vp9_clear_system_state(); // __asm emms; - - // New Two pass RC - target = cpi->per_frame_bandwidth; - - if (cpi->oxcf.rc_max_intra_bitrate_pct) { - int max_rate = cpi->per_frame_bandwidth - * cpi->oxcf.rc_max_intra_bitrate_pct / 100; - - if (target > max_rate) - target = max_rate; +int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const int min_frame_target = MAX(rc->min_frame_bandwidth, + rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) + target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; } + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) + target = rc->max_frame_bandwidth; + return target; +} - cpi->this_frame_target = target; +int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->rc_max_intra_bitrate_pct) { + const int max_rate = rc->avg_frame_bandwidth * + oxcf->rc_max_intra_bitrate_pct / 100; + target = MIN(target, max_rate); + } + if (target > rc->max_frame_bandwidth) + target = rc->max_frame_bandwidth; + return target; } -// Do the best we can to define the parameters for the next GF based -// on what information we have available. -// -// In this experimental code only two pass is supported -// so we just use the interval determined in the two pass code. -static void calc_gf_params(VP9_COMP *cpi) { - // Set the gf interval - cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; +// Update the buffer level for higher layers, given the encoded current layer. +static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { + int temporal_layer = 0; + int current_temporal_layer = svc->temporal_layer_id; + for (temporal_layer = current_temporal_layer + 1; + temporal_layer < svc->number_temporal_layers; ++temporal_layer) { + LAYER_CONTEXT *lc = &svc->layer_context[temporal_layer]; + RATE_CONTROL *lrc = &lc->rc; + int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate - + encoded_frame_size); + lrc->bits_off_target += bits_off_for_this_layer; + + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + } } +// Update the buffer level: leaky bucket model. 
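The update_buffer_level() function that follows implements the leaky bucket named in the comment above: every shown frame credits its per-frame budget and debits what was actually spent, while non-shown frames (an ARF, for instance) are pure debit. A condensed sketch of the same update, with simplified types and without the SVC layer propagation:

    #include <stdint.h>

    static int64_t update_bucket(int64_t bits_off_target,
                                 int64_t maximum_buffer_size,
                                 int avg_frame_bandwidth,
                                 int encoded_frame_size, int show_frame) {
      if (show_frame)
        bits_off_target += avg_frame_bandwidth - encoded_frame_size;
      else
        bits_off_target -= encoded_frame_size;  /* pure overhead */
      if (bits_off_target > maximum_buffer_size)
        bits_off_target = maximum_buffer_size;  /* the bucket cannot overfill */
      return bits_off_target;
    }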
+static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; -static void calc_pframe_target_size(VP9_COMP *cpi) { - const int min_frame_target = MAX(cpi->min_frame_bandwidth, - cpi->av_per_frame_bandwidth >> 5); - if (cpi->refresh_alt_ref_frame) { - // Special alt reference frame case - // Per frame bit target for the alt ref frame - cpi->per_frame_bandwidth = cpi->twopass.gf_bits; - cpi->this_frame_target = cpi->per_frame_bandwidth; + // Non-viewable frames are a special case and are treated as pure overhead. + if (!cm->show_frame) { + rc->bits_off_target -= encoded_frame_size; } else { - // Normal frames (gf,and inter) - cpi->this_frame_target = cpi->per_frame_bandwidth; + rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; } - // Check that the total sum of adjustments is not above the maximum allowed. - // That is, having allowed for the KF and GF penalties, we have not pushed - // the current inter-frame target too low. If the adjustment we apply here is - // not capable of recovering all the extra bits we have spent in the KF or GF, - // then the remainder will have to be recovered over a longer time span via - // other buffer / rate control mechanisms. - if (cpi->this_frame_target < min_frame_target) - cpi->this_frame_target = min_frame_target; + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = MIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; - if (!cpi->refresh_alt_ref_frame) - // Note the baseline target data rate for this inter frame. - cpi->inter_frame_target = cpi->this_frame_target; + if (cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { + update_layer_buffer_level(&cpi->svc, encoded_frame_size); + } +} - // Adjust target frame size for Golden Frames: - if (cpi->frames_till_gf_update_due == 0) { - const int q = (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] - : cpi->oxcf.fixed_q; +void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { + int i; - cpi->refresh_golden_frame = 1; + if (pass == 0 && oxcf->rc_mode == VPX_CBR) { + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + } else { + rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + } - calc_gf_params(cpi); + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->best_allowed_q; - // If we are using alternate ref instead of gf then do not apply the boost - // It will instead be applied to the altref update - // Jims modified boost - if (!cpi->source_alt_ref_active) { - if (cpi->oxcf.fixed_q < 0) { - // The spend on the GF is defined in the two pass code - // for two pass encodes - cpi->this_frame_target = cpi->per_frame_bandwidth; - } else { - cpi->this_frame_target = - (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0) - * cpi->last_boost) / 100; - } + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; + + rc->rolling_target_bits = rc->avg_frame_bandwidth; + rc->rolling_actual_bits = rc->avg_frame_bandwidth; + rc->long_rolling_target_bits = rc->avg_frame_bandwidth; + rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; + + rc->total_actual_bits = 0; + rc->total_target_bits = 0; + rc->total_target_vs_actual = 0; + + rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; + rc->frames_since_key = 8; // Sensible default for first frame. + rc->this_key_frame_forced = 0; + rc->next_key_frame_forced = 0; + rc->source_alt_ref_pending = 0; + rc->source_alt_ref_active = 0; + + rc->frames_till_gf_update_due = 0; + + rc->ni_av_qi = oxcf->worst_allowed_q; + rc->ni_tot_qi = 0; + rc->ni_frames = 0; + + rc->tot_q = 0.0; + rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q); + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + rc->rate_correction_factors[i] = 1.0; + } +} + +int vp9_rc_drop_frame(VP9_COMP *cpi) { + const VP9EncoderConfig *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + + if (!oxcf->drop_frames_water_mark) { + return 0; + } else { + if (rc->buffer_level < 0) { + // Always drop if buffer is below 0. + return 1; } else { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - cpi->this_frame_target = 0; + // If buffer is below drop_mark, for now just drop every other frame + // (starting with the next frame) until it increases back over drop_mark. 
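/*
 * A standalone sketch of the decimation state machine that follows, with
 * the RATE_CONTROL fields passed in as plain parameters (hypothetical
 * names, for illustration only). With decimation_factor == 1 it returns
 * drop/keep/drop/keep... while the buffer stays at or below the drop
 * mark, matching the every-other-frame behaviour described above.
 */
static int should_drop(int64_t buffer_level, int64_t drop_mark,
                       int *decimation_factor, int *decimation_count) {
  if (buffer_level > drop_mark && *decimation_factor > 0)
    --*decimation_factor;                 // Buffer recovered: back off.
  else if (buffer_level <= drop_mark && *decimation_factor == 0)
    *decimation_factor = 1;               // Buffer low: start decimating.
  if (*decimation_factor > 0) {
    if (*decimation_count > 0) {
      --*decimation_count;
      return 1;                           // Drop this frame.
    }
    *decimation_count = *decimation_factor;
    return 0;                             // Keep this frame.
  }
  *decimation_count = 0;
  return 0;
}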
+ int drop_mark = (int)(oxcf->drop_frames_water_mark * + rc->optimal_buffer_level / 100); + if ((rc->buffer_level > drop_mark) && + (rc->decimation_factor > 0)) { + --rc->decimation_factor; + } else if (rc->buffer_level <= drop_mark && + rc->decimation_factor == 0) { + rc->decimation_factor = 1; + } + if (rc->decimation_factor > 0) { + if (rc->decimation_count > 0) { + --rc->decimation_count; + return 1; + } else { + rc->decimation_count = rc->decimation_factor; + return 0; + } + } else { + rc->decimation_count = 0; + return 0; + } } } } +static double get_rate_correction_factor(const VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; -void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { - const int q = cpi->common.base_qindex; + if (cpi->common.frame_type == KEY_FRAME) { + return rc->rate_correction_factors[KF_STD]; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + return rc->rate_correction_factors[rf_lvl]; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && + !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR)) + return rc->rate_correction_factors[GF_ARF_STD]; + else + return rc->rate_correction_factors[INTER_NORMAL]; + } +} + +static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { + RATE_CONTROL *const rc = &cpi->rc; + + if (cpi->common.frame_type == KEY_FRAME) { + rc->rate_correction_factors[KF_STD] = factor; + } else if (cpi->oxcf.pass == 2) { + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && + !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR)) + rc->rate_correction_factors[GF_ARF_STD] = factor; + else + rc->rate_correction_factors[INTER_NORMAL] = factor; + } +} + +void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { + const VP9_COMMON *const cm = &cpi->common; int correction_factor = 100; - double rate_correction_factor; + double rate_correction_factor = get_rate_correction_factor(cpi); double adjustment_limit; int projected_size_based_on_q = 0; - // Clear down mmx registers to allow floating point in what follows - vp9_clear_system_state(); // __asm emms; + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) + return; - if (cpi->common.frame_type == KEY_FRAME) { - rate_correction_factor = cpi->key_frame_rate_correction_factor; - } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) - rate_correction_factor = cpi->gf_rate_correction_factor; - else - rate_correction_factor = cpi->rate_correction_factor; - } + // Clear down mmx registers to allow floating point in what follows + vp9_clear_system_state(); // Work out how big we would have expected the frame to be at this Q given // the current correction factor. // Stay in double to avoid int overflow when values are large - projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q, - cpi->common.MBs, + projected_size_based_on_q = estimate_bits_at_q(cm->frame_type, + cm->base_qindex, cm->MBs, rate_correction_factor); - // Work out a size correction factor. 
if (projected_size_based_on_q > 0) - correction_factor = - (100 * cpi->projected_frame_size) / projected_size_based_on_q; + correction_factor = (100 * cpi->rc.projected_frame_size) / + projected_size_based_on_q; // More heavily damped adjustment used if we have been oscillating either side // of target. @@ -284,74 +351,48 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { break; } - // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) ) if (correction_factor > 102) { // We are not already at the worst allowable quality - correction_factor = - (int)(100 + ((correction_factor - 100) * adjustment_limit)); - rate_correction_factor = - ((rate_correction_factor * correction_factor) / 100); + correction_factor = (int)(100 + ((correction_factor - 100) * + adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; // Keep rate_correction_factor within limits if (rate_correction_factor > MAX_BPB_FACTOR) rate_correction_factor = MAX_BPB_FACTOR; } else if (correction_factor < 99) { // We are not already at the best allowable quality - correction_factor = - (int)(100 - ((100 - correction_factor) * adjustment_limit)); - rate_correction_factor = - ((rate_correction_factor * correction_factor) / 100); + correction_factor = (int)(100 - ((100 - correction_factor) * + adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; // Keep rate_correction_factor within limits if (rate_correction_factor < MIN_BPB_FACTOR) rate_correction_factor = MIN_BPB_FACTOR; } - if (cpi->common.frame_type == KEY_FRAME) { - cpi->key_frame_rate_correction_factor = rate_correction_factor; - } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) - cpi->gf_rate_correction_factor = rate_correction_factor; - else - cpi->rate_correction_factor = rate_correction_factor; - } + set_rate_correction_factor(cpi, rate_correction_factor); } -int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { - int q = cpi->active_worst_quality; - - int i; +int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality) { + const VP9_COMMON *const cm = &cpi->common; + int q = active_worst_quality; int last_error = INT_MAX; - int target_bits_per_mb; - int bits_per_mb_at_this_q; - double correction_factor; - - // Select the appropriate correction factor based upon type of frame. - if (cpi->common.frame_type == KEY_FRAME) { - correction_factor = cpi->key_frame_rate_correction_factor; - } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) - correction_factor = cpi->gf_rate_correction_factor; - else - correction_factor = cpi->rate_correction_factor; - } + int i, target_bits_per_mb; + const double correction_factor = get_rate_correction_factor(cpi); // Calculate required scaling factor based on target frame size and size of // frame produced using previous Q. 
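/*
 * Worked example of the BPER_MB_NORMBITS (Q9) fixed point used just below,
 * with made-up numbers: a 1080p frame has 8160 16x16 MBs, so a 163200-bit
 * target becomes (163200 << 9) / 8160 = 10240, i.e. 20 bits/MB in Q9.
 * Widening to uint64_t before the shift is what makes the special case
 * deleted just below (the INT_MAX overflow guard) unnecessary.
 */
#include <stdint.h>

static int target_bits_per_mb_q9(int target_bits_per_frame, int mbs) {
  // Targets near INT_MAX would overflow a 32-bit shift; do the
  // arithmetic in 64 bits and truncate at the end.
  return (int)(((uint64_t)target_bits_per_frame << 9) / mbs);
}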
- if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS)) - target_bits_per_mb = - (target_bits_per_frame / cpi->common.MBs) - << BPER_MB_NORMBITS; // Case where we would overflow int - else - target_bits_per_mb = - (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs; + target_bits_per_mb = + ((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs; - i = cpi->active_best_quality; + i = active_best_quality; do { - bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i, - correction_factor); + const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i, + correction_factor); if (bits_per_mb_at_this_q <= target_bits_per_mb) { if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) @@ -363,114 +404,982 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) { } else { last_error = bits_per_mb_at_this_q - target_bits_per_mb; } - } while (++i <= cpi->active_worst_quality); + } while (++i <= active_worst_quality); return q; } - -static int estimate_keyframe_frequency(VP9_COMP *cpi) { - int i; - - // Average key frame frequency - int av_key_frame_frequency = 0; - - /* First key frame at start of sequence is a special case. We have no - * frequency data. - */ - if (cpi->key_frame_count == 1) { - /* Assume a default of 1 kf every 2 seconds, or the max kf interval, - * whichever is smaller. - */ - int key_freq = cpi->oxcf.key_freq > 0 ? cpi->oxcf.key_freq : 1; - av_key_frame_frequency = (int)cpi->output_framerate * 2; - - if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) - av_key_frame_frequency = cpi->oxcf.key_freq; - - cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] - = av_key_frame_frequency; +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; } else { - unsigned int total_weight = 0; - int last_kf_interval = - (cpi->frames_since_key > 0) ? 
cpi->frames_since_key : 1; - - /* reset keyframe context and calculate weighted average of last - * KEY_FRAME_CONTEXT keyframes - */ - for (i = 0; i < KEY_FRAME_CONTEXT; i++) { - if (i < KEY_FRAME_CONTEXT - 1) - cpi->prior_key_frame_distance[i] - = cpi->prior_key_frame_distance[i + 1]; - else - cpi->prior_key_frame_distance[i] = last_kf_interval; - - av_key_frame_frequency += prior_key_frame_weight[i] - * cpi->prior_key_frame_distance[i]; - total_weight += prior_key_frame_weight[i]; - } - - av_key_frame_frequency /= total_weight; + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; } - return av_key_frame_frequency; } +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q) { + return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} -void vp9_adjust_key_frame_context(VP9_COMP *cpi) { - // Clear down mmx registers to allow floating point in what follows +static int get_gf_active_quality(const RATE_CONTROL *const rc, int q) { + return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const unsigned int curr_frame = cpi->common.current_video_frame; + int active_worst_quality; + + if (cpi->common.frame_type == KEY_FRAME) { + active_worst_quality = curr_frame == 0 ? rc->worst_quality + : rc->last_q[KEY_FRAME] * 2; + } else { + if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4 + : rc->last_q[INTER_FRAME]; + } else { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2 + : rc->last_q[INTER_FRAME] * 2; + } + } + return MIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = rc->optimal_buffer_level >> 2; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + if (cm->frame_type == KEY_FRAME) + return rc->worst_quality; + if (cm->current_video_frame > 1) + active_worst_quality = MIN(rc->worst_quality, + rc->avg_frame_qindex[INTER_FRAME] * 5 / 4); + else + active_worst_quality = MIN(rc->worst_quality, + rc->avg_frame_qindex[KEY_FRAME] * 3 / 2); + if (rc->buffer_level > rc->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment, ~30%. 
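/*
 * Worked example (made-up numbers) of the buffer-based adjustment that
 * follows. Suppose active_worst_quality = 60, optimal_buffer_level = 5000,
 * maximum_buffer_size = 6000 and buffer_level = 5600. Then
 * max_adjustment_down = 60 / 3 = 20, buff_lvl_step = (6000 - 5000) / 20 = 50,
 * adjustment = (5600 - 5000) / 50 = 12, so active_worst_quality drops from
 * 60 to 48: the further the buffer sits above its optimal level, the more
 * headroom the quantizer gets, up to the ~30% cap.
 */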
+ int max_adjustment_down = active_worst_quality / 3; + if (max_adjustment_down) { + buff_lvl_step = ((rc->maximum_buffer_size - + rc->optimal_buffer_level) / max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = + (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) * + (rc->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. + active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + (last_boosted_q * 0.75)); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else if (cm->current_video_frame > 0) { + // Not the first frame of a one-pass encode, and kf_boost is set. + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(rc, q_val, + q_val * q_adj_factor); + } + } else if (!rc->is_src_frame_alt_ref && + !cpi->use_svc && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_gf_active_quality(rc, q); + } else { + // Use the lower of active_worst_quality and recent/average Q.
+ if (cm->current_video_frame > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && + !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + int qdelta = 0; + vp9_clear_system_state(); + qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 2.0); + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; + } +#endif + + // Special case code to try and match quality with forced key frames + if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int get_active_cq_level(const RATE_CONTROL *rc, + const VP9EncoderConfig *const oxcf) { + static const double cq_adjust_threshold = 0.5; + int active_cq_level = oxcf->cq_level; + if (oxcf->rc_mode == VPX_CQ && + rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = get_active_cq_level(rc, oxcf); + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); + int q; + + if (frame_is_intra_only(cm)) { + + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. 
+ if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.75); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else { + // Not the first frame of a one-pass encode, and kf_boost is set. + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(rc, q_val, + q_val * q_adj_factor); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = rc->avg_frame_qindex[KEY_FRAME]; + } + // For constrained quality, don't allow Q less than the cq level. + if (oxcf->rc_mode == VPX_CQ) { + if (q < cq_level) + q = cq_level; + + active_best_quality = get_gf_active_quality(rc, q); + + // Constrained quality uses a slightly lower active best. + active_best_quality = active_best_quality * 15 / 16; + + } else if (oxcf->rc_mode == VPX_Q) { + if (!cpi->refresh_alt_ref_frame) { + active_best_quality = cq_level; + } else { + active_best_quality = get_gf_active_quality(rc, q); + } + } else { + active_best_quality = get_gf_active_quality(rc, q); + } + } else { + if (oxcf->rc_mode == VPX_Q) { + active_best_quality = cq_level; + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (cm->current_video_frame > 1) + active_best_quality = inter_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && + (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY + { + int qdelta = 0; + vp9_clear_system_state(); + + // Limit Q range for the adaptive loop. + if (cm->frame_type == KEY_FRAME && + !rc->this_key_frame_forced && + !(cm->current_video_frame == 0)) { + qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 2.0); + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, 1.75); + } + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ?
*top_index : *bottom_index; + } +#endif + + if (oxcf->rc_mode == VPX_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, + int *bottom_index, + int *top_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int cq_level = get_active_cq_level(rc, oxcf); + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + int q; + + if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) { + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = vp9_convert_qindex_to_q(qindex); + int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.75); + active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + // Baseline value derived from cpi->active_worst_quality and kf boost. + active_best_quality = get_kf_active_quality(rc, active_worst_quality); + + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = vp9_convert_qindex_to_q(active_best_quality); + active_best_quality += vp9_compute_qdelta(rc, q_val, + q_val * q_adj_factor); + } + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + // For constrained quality, don't allow Q less than the cq level. + if (oxcf->rc_mode == VPX_CQ) { + if (q < cq_level) + q = cq_level; + + active_best_quality = get_gf_active_quality(rc, q); + + // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16; + + } else if (oxcf->rc_mode == VPX_Q) { + if (!cpi->refresh_alt_ref_frame) { + active_best_quality = cq_level; + } else { + active_best_quality = get_gf_active_quality(rc, q); + } + } else { + active_best_quality = get_gf_active_quality(rc, q); + } + } else { + if (oxcf->rc_mode == VPX_Q) { + active_best_quality = cq_level; + } else { + active_best_quality = inter_minq[active_worst_quality]; + + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && + (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits. + active_best_quality = clamp(active_best_quality, + rc->best_quality, rc->worst_quality); + active_worst_quality = clamp(active_worst_quality, + active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + +#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY vp9_clear_system_state(); + { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 1.00, // INTER_HIGH + 1.50, // GF_ARF_LOW + 1.75, // GF_ARF_STD + 2.00, // KF_STD + }; + const double rate_factor = + rate_factor_deltas[gf_group->rf_level[gf_group->index]]; + int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, + active_worst_quality, rate_factor); + *top_index = active_worst_quality + qdelta; + *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; + } +#endif - cpi->frames_since_key = 0; - cpi->key_frame_count++; + if (oxcf->rc_mode == VPX_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames. + } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + active_best_quality, active_worst_quality); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate. + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && + *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; } +int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, + int *bottom_index, int *top_index) { + int q; + if (cpi->oxcf.pass == 0) { + if (cpi->oxcf.rc_mode == VPX_CBR) + q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); + else + q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); + } else { + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + } + if (cpi->sf.use_nonrd_pick_mode) { + if (cpi->sf.force_frame_boost == 1) + q -= cpi->sf.max_delta_qindex; -void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit, - int *frame_over_shoot_limit) { - // Set-up bounds on acceptable frame size: - if (cpi->oxcf.fixed_q >= 0) { - // Fixed Q scenario: frame size never outranges target (there is no target!) 
+ if (q < *bottom_index) + *bottom_index = q; + else if (q > *top_index) + *top_index = q; + } + return q; +} + +void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, + int frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit) { + if (cpi->oxcf.rc_mode == VPX_Q) { *frame_under_shoot_limit = 0; *frame_over_shoot_limit = INT_MAX; } else { - if (cpi->common.frame_type == KEY_FRAME) { - *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; - } else { - if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) { - *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; - } else { - // Stron overshoot limit for constrained quality - if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) { - *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8; - } else { - *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8; - *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8; - } - } - } - // For very small rate targets where the fractional adjustment - // (eg * 7/8) may be tiny make sure there is at least a minimum - // range. - *frame_over_shoot_limit += 200; - *frame_under_shoot_limit -= 200; - if (*frame_under_shoot_limit < 0) - *frame_under_shoot_limit = 0; + // For very small rate targets, where the fractional adjustment + // may be tiny, make sure there is at least a minimum range. + const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100; + *frame_under_shoot_limit = MAX(frame_target - tolerance - 200, 0); + *frame_over_shoot_limit = MIN(frame_target + tolerance + 200, + cpi->rc.max_frame_bandwidth); } } +void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) { + const VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; -// return of 0 means drop frame -int vp9_pick_frame_size(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; + rc->this_frame_target = target; + + // Target rate per SB64 (including partial SB64s). + rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) / + (cm->width * cm->height); +} + +static void update_alt_ref_frame_stats(VP9_COMP *cpi) { + // A refresh on this frame means following frames don't refresh unless + // the user specifies it. + RATE_CONTROL *const rc = &cpi->rc; + rc->frames_since_golden = 0; + + // Mark the alt ref as done (setting to 0 means no further alt refs pending). + rc->source_alt_ref_pending = 0; + + // Set the alternate reference frame active flag. + rc->source_alt_ref_active = 1; +} + +static void update_golden_frame_stats(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + + // Update the Golden frame usage counts.
+ if (cpi->refresh_golden_frame) { + // A refresh on this frame means following frames don't refresh unless + // the user specifies it. + rc->frames_since_golden = 0; + + if (cpi->oxcf.pass == 2) { + if (!rc->source_alt_ref_pending && + cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD) + rc->source_alt_ref_active = 0; + } else if (!rc->source_alt_ref_pending) { + rc->source_alt_ref_active = 0; + } + + // Decrement the countdown until the next gf. + if (rc->frames_till_gf_update_due > 0) + rc->frames_till_gf_update_due--; + + } else if (!cpi->refresh_alt_ref_frame) { + // Decrement the countdown until the next gf. + if (rc->frames_till_gf_update_due > 0) + rc->frames_till_gf_update_due--; + + rc->frames_since_golden++; + } +} + +void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + const int qindex = cm->base_qindex; + + // Update rate control heuristics + rc->projected_frame_size = (int)(bytes_used << 3); + + // Post encode loop adjustment of Q prediction. + vp9_rc_update_rate_correction_factors( + cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF || + oxcf->rc_mode == VPX_CBR) ? 2 : 0); + + // Keep a record of last Q and ambient average Q. + if (cm->frame_type == KEY_FRAME) { + rc->last_q[KEY_FRAME] = qindex; + rc->avg_frame_qindex[KEY_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); + } else { + if (rc->is_src_frame_alt_ref || + !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) || + (cpi->use_svc && oxcf->rc_mode == VPX_CBR)) { + rc->last_q[INTER_FRAME] = qindex; + rc->avg_frame_qindex[INTER_FRAME] = + ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); + rc->ni_frames++; + rc->tot_q += vp9_convert_qindex_to_q(qindex); + rc->avg_q = rc->tot_q / rc->ni_frames; + // Calculate the average Q for normal inter frames (not key or GFU + // frames). + rc->ni_tot_qi += qindex; + rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames; + } + } + + // Keep record of last boosted (KF/GF/ARF) Q value. + // If the current frame is coded at a lower Q, then we also update it. + // If all mbs in this group are skipped only update if the Q value is + // better than that already stored. + // This is used to help set quality in forced key frames to reduce popping. + if ((qindex < rc->last_boosted_qindex) || + ((cpi->static_mb_pct < 100) && + ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { + rc->last_boosted_qindex = qindex; + } + + update_buffer_level(cpi, rc->projected_frame_size); + + // Rolling monitors of whether we are over or underspending, used to help + // regulate min and max Q in two pass. + if (cm->frame_type != KEY_FRAME) { + rc->rolling_target_bits = ROUND_POWER_OF_TWO( + rc->rolling_target_bits * 3 + rc->this_frame_target, 2); + rc->rolling_actual_bits = ROUND_POWER_OF_TWO( + rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2); + rc->long_rolling_target_bits = ROUND_POWER_OF_TWO( + rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5); + rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO( + rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5); + } + + // Actual bits spent + rc->total_actual_bits += rc->projected_frame_size; + rc->total_target_bits += cm->show_frame ?
rc->avg_frame_bandwidth : 0; + + rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; + + if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && + (cm->frame_type != KEY_FRAME)) + // Update the alternate reference frame stats as appropriate. + update_alt_ref_frame_stats(cpi); + else + // Update the Golden frame stats as appropriate. + update_golden_frame_stats(cpi); if (cm->frame_type == KEY_FRAME) - calc_iframe_target_size(cpi); - else - calc_pframe_target_size(cpi); - - return 1; + rc->frames_since_key = 0; + if (cm->show_frame) { + rc->frames_since_key++; + rc->frames_to_key--; + } +} + +void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + cpi->common.last_frame_type = cpi->common.frame_type; + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; +} + +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 1 + +static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { + static const int af_ratio = 10; + const RATE_CONTROL *const rc = &cpi->rc; + int target; +#if USE_ALTREF_FOR_ONE_PASS + target = (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? + (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / + (rc->baseline_gf_interval + af_ratio - 1) : + (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / + (rc->baseline_gf_interval + af_ratio - 1); +#else + target = rc->avg_frame_bandwidth; +#endif + return vp9_rc_clamp_pframe_target_size(cpi, target); +} + +static int calc_iframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { + static const int kf_ratio = 25; + const RATE_CONTROL *rc = &cpi->rc; + const int target = rc->avg_frame_bandwidth * kf_ratio; + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. + if (!cpi->refresh_alt_ref_frame && + (cm->current_video_frame == 0 || + (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || + (cpi->oxcf.auto_key && 0))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = cm->current_video_frame != 0 && + rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + } else { + cm->frame_type = INTER_FRAME; + } + if (rc->frames_till_gf_update_due == 0) { + rc->baseline_gf_interval = DEFAULT_GF_INTERVAL; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + // NOTE: frames_till_gf_update_due must be <= frames_to_key. 
+ if (rc->frames_till_gf_update_due > rc->frames_to_key) + rc->frames_till_gf_update_due = rc->frames_to_key; + cpi->refresh_golden_frame = 1; + rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + if (cm->frame_type == KEY_FRAME) + target = calc_iframe_target_size_one_pass_vbr(cpi); + else + target = calc_pframe_target_size_one_pass_vbr(cpi); + vp9_rc_set_frame_target(cpi, target); +} + +static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const RATE_CONTROL *rc = &cpi->rc; + const SVC *const svc = &cpi->svc; + const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; + const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100; + int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); + int target = rc->avg_frame_bandwidth; + if (svc->number_temporal_layers > 1 && + oxcf->rc_mode == VPX_CBR) { + // Note that for layers, avg_frame_bandwidth is the cumulative + // per-frame-bandwidth. For the target size of this frame, use the + // layer average frame size (i.e., non-cumulative per-frame-bw). + int current_temporal_layer = svc->temporal_layer_id; + const LAYER_CONTEXT *lc = &svc->layer_context[current_temporal_layer]; + target = lc->avg_frame_size; + min_frame_target = MAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS); + } + if (diff > 0) { + // Lower the target bandwidth for this frame. + const int pct_low = (int)MIN(diff / one_pct_bits, oxcf->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { + // Increase the target bandwidth for this frame. + const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); + target += (target * pct_high) / 200; + } + return MAX(min_frame_target, target); +} + +static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { + const RATE_CONTROL *rc = &cpi->rc; + const VP9EncoderConfig *oxcf = &cpi->oxcf; + const SVC *const svc = &cpi->svc; + int target; + if (cpi->common.current_video_frame == 0) { + target = ((rc->starting_buffer_level / 2) > INT_MAX) + ? INT_MAX : (int)(rc->starting_buffer_level / 2); + } else { + int kf_boost = 32; + double framerate = cpi->framerate; + if (svc->number_temporal_layers > 1 && + oxcf->rc_mode == VPX_CBR) { + // Use the layer framerate for temporal layers CBR mode. 
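/*
 * Worked example (made-up numbers) for the key-frame boost computed just
 * below: at framerate = 30, kf_boost = MAX(32, 2 * 30 - 16) = 44, so a key
 * frame gets target = ((16 + 44) * avg_frame_bandwidth) >> 4, i.e. 3.75x
 * the average frame budget. Within half a second of the previous key frame
 * the boost is scaled down proportionally by
 * frames_since_key / (framerate / 2).
 */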
+ const LAYER_CONTEXT *lc = &svc->layer_context[svc->temporal_layer_id]; + framerate = lc->framerate; + } + kf_boost = MAX(kf_boost, (int)(2 * framerate - 16)); + if (rc->frames_since_key < framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / + (framerate / 2)); + } + target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4; + } + return vp9_rc_clamp_iframe_target_size(cpi, target); +} + +void vp9_rc_get_svc_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target = rc->avg_frame_bandwidth; + if ((cm->current_video_frame == 0) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || + (cpi->oxcf.auto_key && (rc->frames_since_key % + cpi->oxcf.key_freq == 0))) { + cm->frame_type = KEY_FRAME; + rc->source_alt_ref_active = 0; + + if (is_two_pass_svc(cpi)) { + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1; + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); + } + + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + target = calc_iframe_target_size_one_pass_cbr(cpi); + } + } else { + cm->frame_type = INTER_FRAME; + + if (is_two_pass_svc(cpi)) { + LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + if (cpi->svc.spatial_layer_id == 0) { + lc->is_key_frame = 0; + } else { + lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + if (lc->is_key_frame) + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + } + cpi->ref_frame_flags &= (~VP9_ALT_FLAG); + } + + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + target = calc_pframe_target_size_one_pass_cbr(cpi); + } + } + vp9_rc_set_frame_target(cpi, target); + rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; +} + +void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int target; + // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. + if ((cm->current_video_frame == 0 || + (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || + (cpi->oxcf.auto_key && 0))) { + cm->frame_type = KEY_FRAME; + rc->this_key_frame_forced = cm->current_video_frame != 0 && + rc->frames_to_key == 0; + rc->frames_to_key = cpi->oxcf.key_freq; + rc->kf_boost = DEFAULT_KF_BOOST; + rc->source_alt_ref_active = 0; + target = calc_iframe_target_size_one_pass_cbr(cpi); + } else { + cm->frame_type = INTER_FRAME; + target = calc_pframe_target_size_one_pass_cbr(cpi); + } + vp9_rc_set_frame_target(cpi, target); + // Don't use gf_update by default in CBR mode. + rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; +} + +int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) { + int start_index = rc->worst_quality; + int target_index = rc->worst_quality; + int i; + + // Convert the average q value to an index. 
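/*
 * Usage sketch for the search below: callers ask for the qindex step that
 * scales the real-valued Q by some factor, e.g. the forced-key-frame paths
 * above request a 25% Q reduction from the last boosted Q:
 *
 *   double q = vp9_convert_qindex_to_q(rc->last_boosted_qindex);
 *   int delta = vp9_compute_qdelta(rc, q, q * 0.75);  // delta <= 0
 *
 * Both loops walk the monotone qindex -> Q mapping, so the result is just
 * the difference of the first indices whose Q reaches each target.
 */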
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) { + start_index = i; + if (vp9_convert_qindex_to_q(i) >= qstart) + break; + } + + // Convert the q target to an index + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + target_index = i; + if (vp9_convert_qindex_to_q(i) >= qtarget) + break; + } + + return target_index - start_index; +} + +int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio) { + int target_index = rc->worst_quality; + int i; + + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0); + + // Find the target bits per mb based on the base value and given ratio. + const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); + + // Convert the q target to an index + for (i = rc->best_quality; i < rc->worst_quality; ++i) { + target_index = i; + if (vp9_rc_bits_per_mb(frame_type, i, 1.0) <= target_bits_per_mb) + break; + } + + return target_index - qindex; +} + +void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi, + RATE_CONTROL *const rc) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + // Set the maximum gf/arf interval. + rc->max_gf_interval = 16; + + // Extended interval for genuinely static scenes. + rc->static_scene_max_gf_interval = oxcf->key_freq >> 1; + if (rc->static_scene_max_gf_interval > (MAX_LAG_BUFFERS * 2)) + rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; + + if (is_altref_enabled(cpi)) { + if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) + rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; + } + + if (rc->max_gf_interval > rc->static_scene_max_gf_interval) + rc->max_gf_interval = rc->static_scene_max_gf_interval; +} + +void vp9_rc_update_framerate(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + int vbr_max_bits; + + rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); + rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * + oxcf->two_pass_vbrmin_section / 100); + + rc->min_frame_bandwidth = MAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); + + // A maximum bitrate for a frame is defined. + // The baseline for this aligns with HW implementations that + // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits + // per 16x16 MB (averaged over a frame). However this limit is extended if + // a very high rate is given on the command line or the rate cannot + // be achieved because of a user-specified max q (e.g. when the user + // specifies a lossless encode).
+ vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth * + oxcf->two_pass_vbrmax_section) / 100); + rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), + vbr_max_bits); + + vp9_rc_set_gf_max_interval(cpi, rc); } diff --git a/media/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/vp9/encoder/vp9_ratectrl.h index ddda7130c9af..456daf48d032 100644 --- a/media/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/media/libvpx/vp9/encoder/vp9_ratectrl.h @@ -12,28 +12,188 @@ #ifndef VP9_ENCODER_VP9_RATECTRL_H_ #define VP9_ENCODER_VP9_RATECTRL_H_ -#include "vp9/encoder/vp9_onyx_int.h" +#include "vpx/vpx_integer.h" -#define FRAME_OVERHEAD_BITS 200 +#include "vp9/common/vp9_blockd.h" -void vp9_save_coding_context(VP9_COMP *cpi); -void vp9_restore_coding_context(VP9_COMP *cpi); +#ifdef __cplusplus extern "C" { +#endif -void vp9_setup_key_frame(VP9_COMP *cpi); -void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var); -int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame); -void vp9_adjust_key_frame_context(VP9_COMP *cpi); -void vp9_compute_frame_size_bounds(VP9_COMP *cpi, - int *frame_under_shoot_limit, - int *frame_over_shoot_limit); +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 -// return of 0 means drop frame -int vp9_pick_frame_size(VP9_COMP *cpi); +typedef enum { + INTER_NORMAL = 0, + INTER_HIGH = 1, + GF_ARF_LOW = 2, + GF_ARF_STD = 3, + KF_STD = 4, + RATE_FACTOR_LEVELS = 5 +} RATE_FACTOR_LEVEL; + +typedef struct { + // Rate targeting variables + int base_frame_target; // A baseline frame target before adjustment + // for previous under or over shoot. + int this_frame_target; // Actual frame target after rc adjustment. + int projected_frame_size; + int sb64_target_rate; + int last_q[FRAME_TYPES]; // Separate values for Intra/Inter + int last_boosted_qindex; // Last boosted GF/KF/ARF q + + int gfu_boost; + int last_boost; + int kf_boost; + + double rate_correction_factors[RATE_FACTOR_LEVELS]; + + int frames_since_golden; + int frames_till_gf_update_due; + int max_gf_interval; + int static_scene_max_gf_interval; + int baseline_gf_interval; + int frames_to_key; + int frames_since_key; + int this_key_frame_forced; + int next_key_frame_forced; + int source_alt_ref_pending; + int source_alt_ref_active; + int is_src_frame_alt_ref; + + int avg_frame_bandwidth; // Average frame size target for clip + int min_frame_bandwidth; // Minimum allocation used for any frame + int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+ + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex[FRAME_TYPES]; + double tot_q; + double avg_q; + + int64_t buffer_level; + int64_t bits_off_target; + int64_t vbr_bits_off_target; + + int decimation_factor; + int decimation_count; + + int rolling_target_bits; + int rolling_actual_bits; + + int long_rolling_target_bits; + int long_rolling_actual_bits; + + int64_t total_actual_bits; + int64_t total_target_bits; + int64_t total_target_vs_actual; + + int worst_quality; + int best_quality; + + int64_t starting_buffer_level; + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + // int active_best_quality; +} RATE_CONTROL; + +struct VP9_COMP; +struct VP9EncoderConfig; + +void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, + RATE_CONTROL *rc); double vp9_convert_qindex_to_q(int qindex); -int vp9_gfboost_qadjust(int qindex); -int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor); -void vp9_setup_inter_frame(VP9_COMP *cpi); + +void vp9_rc_init_minq_luts(); + +// Generally at the high level, the following flow is expected +// to be enforced for rate control: +// First call per frame, one of: +// vp9_rc_get_one_pass_vbr_params() +// vp9_rc_get_one_pass_cbr_params() +// vp9_rc_get_svc_params() +// vp9_rc_get_first_pass_params() +// vp9_rc_get_second_pass_params() +// depending on the usage to set the rate control encode parameters desired. +// +// Then, call encode_frame_to_data_rate() to perform the +// actual encode. This function will in turn call encode_frame() +// one or more times, followed by one of: +// vp9_rc_postencode_update() +// vp9_rc_postencode_update_drop_frame() +// +// The majority of rate control parameters are only expected +// to be set in the vp9_rc_get_..._params() functions and +// updated during the vp9_rc_postencode_update...() functions. +// The only exceptions are vp9_rc_drop_frame() and +// vp9_rc_update_rate_correction_factors() functions. + +// Functions to set parameters for encoding before the actual +// encode_frame_to_data_rate() function. +void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi); +void vp9_rc_get_svc_params(struct VP9_COMP *cpi); + +// Post encode update of the rate control parameters based +// on bytes used +void vp9_rc_postencode_update(struct VP9_COMP *cpi, uint64_t bytes_used); +// Post encode update of the rate control parameters for dropped frames +void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); + +// Updates rate correction factors +// Changes only the rate correction factors in the rate control structure. +void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var); + +// Decide if we should drop this frame: For 1-pass CBR. +// Changes only the decimation count in the rate control structure +int vp9_rc_drop_frame(struct VP9_COMP *cpi); + +// Computes frame size bounds. +void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, + int this_frame_target, + int *frame_under_shoot_limit, + int *frame_over_shoot_limit); + +// Picks q and q bounds given the target for bits +int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi, + int *bottom_index, + int *top_index); + +// Estimates q to achieve a target bits per frame +int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality); + +// Estimates bits per mb for a given qindex and correction factor. 
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor); + +// Clamping utilities for bitrate targets for iframes and pframes. +int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi, + int target); +int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi, + int target); +// Utility to set frame_target into the RATE_CONTROL structure +// This function is called only from the vp9_rc_get_..._params() functions. +void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a target q value +int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget); + +// Computes a q delta (in "q index" terms) to get from a starting q value +// to a value that should equate to the given rate ratio. +int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio); + +void vp9_rc_update_framerate(struct VP9_COMP *cpi); + +void vp9_rc_set_gf_max_interval(const struct VP9_COMP *const cpi, + RATE_CONTROL *const rc); + +#ifdef __cplusplus } // extern "C" +#endif #endif // VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/media/libvpx/vp9/encoder/vp9_rd.c b/media/libvpx/vp9/encoder/vp9_rd.c new file mode 100644 index 000000000000..b826ff42752f --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_rd.c @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <math.h> +#include <stdio.h> + +#include "./vp9_rtcd.h" + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_tokenize.h" +#include "vp9/encoder/vp9_variance.h" + +#define RD_THRESH_POW 1.25 +#define RD_MULT_EPB_RATIO 64 + +// Factor to weigh the rate for switchable interp filters. +#define SWITCHABLE_INTERP_RATE_FACTOR 1 + +// The baseline rd thresholds for breaking out of the rd loop for +// certain modes are assumed to be based on 8x8 blocks. +// This table is used to correct for block size. +// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
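/*
 * To unpack the Q2 scaling noted above with one entry as an example:
 * BLOCK_4X4 has factor 2, and set_block_thresholds() later in this file
 * computes thresh_mult * (q * factor) / 4, so factor 2 works out to x0.5,
 * the 8x8 baseline factor 4 to x1.0, and the largest block's factor 32
 * to x8.0.
 */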
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = { + 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32 +}; + +static void fill_mode_costs(VP9_COMP *cpi) { + const FRAME_CONTEXT *const fc = &cpi->common.fc; + int i, j; + + for (i = 0; i < INTRA_MODES; ++i) + for (j = 0; j < INTRA_MODES; ++j) + vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], + vp9_intra_mode_tree); + + vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); + vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME], + vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree); + vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME], + fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + vp9_cost_tokens(cpi->switchable_interp_costs[i], + fc->switchable_interp_prob[i], vp9_switchable_interp_tree); +} + +static void fill_token_costs(vp9_coeff_cost *c, + vp9_coeff_probs_model (*p)[PLANE_TYPES]) { + int i, j, k, l; + TX_SIZE t; + for (t = TX_4X4; t <= TX_32X32; ++t) + for (i = 0; i < PLANE_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) { + vp9_prob probs[ENTROPY_NODES]; + vp9_model_to_full_probs(p[t][i][j][k][l], probs); + vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, + vp9_coef_tree); + vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, + vp9_coef_tree); + assert(c[t][i][j][k][0][l][EOB_TOKEN] == + c[t][i][j][k][1][l][EOB_TOKEN]); + } +} + +// Values are now correlated to quantizer. +static int sad_per_bit16lut[QINDEX_RANGE]; +static int sad_per_bit4lut[QINDEX_RANGE]; + +void vp9_init_me_luts() { + int i; + + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. + for (i = 0; i < QINDEX_RANGE; ++i) { + const double q = vp9_convert_qindex_to_q(i); + sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107); + sad_per_bit4lut[i] = (int)(0.063 * q + 2.742); + } +} + +static const int rd_boost_factor[16] = { + 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 +}; +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { +128, 144, 128, 128, 144 +}; + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + const int q = vp9_dc_quant(qindex, 0); + int rdmult = 88 * q * q / 24; + + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = MIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + return rdmult; +} + +static int compute_rd_thresh_factor(int qindex) { + // TODO(debargha): Adjust the function below. 
+ const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12); + return MAX(q, 8); +} + +void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { + cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; + cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; +} + +static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { + int i, bsize, segment_id; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = + clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) + + cm->y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex); + + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + if (bsize >= BLOCK_8X8) { + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = + rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; + } else { + for (i = 0; i < MAX_REFS; ++i) + rd->threshes[segment_id][bsize][i] = + rd->thresh_mult_sub8x8[i] < thresh_max + ? rd->thresh_mult_sub8x8[i] * t / 4 + : INT_MAX; + } + } + } +} + +void vp9_initialize_rd_consts(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + RD_OPT *const rd = &cpi->rd; + int i; + + vp9_clear_system_state(); + + rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128). + rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + + x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO; + x->errorperbit += (x->errorperbit == 0); + + x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && + cm->frame_type != KEY_FRAME) ? 0 : 1; + + set_block_thresholds(cm, rd); + + if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) { + fill_token_costs(x->token_costs, cm->fc.coef_probs); + + for (i = 0; i < PARTITION_CONTEXTS; ++i) + vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i), + vp9_partition_tree); + } + + if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 || + cm->frame_type == KEY_FRAME) { + fill_mode_costs(cpi); + + if (!frame_is_intra_only(cm)) { + vp9_build_nmv_cost_table(x->nmvjointcost, + cm->allow_high_precision_mv ? x->nmvcost_hp + : x->nmvcost, + &cm->fc.nmvc, cm->allow_high_precision_mv); + + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], + cm->fc.inter_mode_probs[i], vp9_inter_mode_tree); + } + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: The tables below must be of the same size. + + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. + + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. 
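As a worked instance of the block-size correction applied by set_block_thresholds() above: the factors in rd_thresh_block_size_factor are stored << 2, so the effective multiplier is factor / 4 (2 -> x0.5 for 4x4, 32 -> x8 for 64x64, relative to the 8x8 baseline). A sketch of the per-mode threshold computation, with the same overflow guard; thresh_mult, q and size_factor are sample inputs:

#include <limits.h>

/* Mirrors the arithmetic in set_block_thresholds(). */
static int sketch_block_threshold(int thresh_mult, int q, int size_factor) {
  const int t = q * size_factor;  /* size_factor from the table above */
  return thresh_mult < INT_MAX / t ? thresh_mult * t / 4  /* undo the << 2 */
                                   : INT_MAX;             /* mode disabled */
}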
+ static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, + 4553, 4389, 4255, 4142, 4044, 3958, 3881, 3811, + 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, + 2589, 2501, 2423, 2353, 2290, 2232, 2179, 2130, + 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, + 1159, 1086, 1021, 963, 911, 864, 821, 781, + 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, + 154, 126, 104, 87, 73, 61, 52, 44, + 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. + static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, + 3, 3, 4, 5, 5, 6, 7, 7, + 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, + 39, 44, 49, 54, 59, 64, 69, 73, + 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, + 274, 301, 327, 351, 375, 397, 418, 439, + 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, + 874, 899, 919, 936, 949, 960, 969, 977, + 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 40, 48, 56, 64, 72, 80, 88, + 96, 112, 128, 144, 160, 176, 192, 208, + 224, 256, 288, 320, 352, 384, 416, 448, + 480, 544, 608, 672, 736, 800, 864, 928, + 992, 1120, 1248, 1376, 1504, 1632, 1760, 1888, + 2016, 2272, 2528, 2784, 3040, 3296, 3552, 3808, + 4064, 4576, 5088, 5600, 6112, 6624, 7136, 7648, + 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, + 32736, 36832, 40928, 45024, 49120, 53216, 57312, 61408, + 65504, 73696, 81888, 90080, 98272, 106464, 114656, 122848, + 131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. 
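A usage sketch for this model (input values are invented; per the shifts in the code that follows, *rate is returned in Q8 units, i.e. 256ths of a bit, and *dist in absolute distortion units):

/* Model the rate/distortion of one 8x8 block with assumed statistics.
 * qstep is the effective quantizer step, i.e. dequant[1] >> 3. */
static void sketch_model_usage(void) {
  int rate;
  int64_t dist;
  const unsigned int var = 4000;  /* assumed sum of squared errors */
  const unsigned int n = 64;      /* pixels in an 8x8 block */
  const unsigned int qstep = 20;  /* assumed effective step size */
  vp9_model_rd_from_var_lapndz(var, n, qstep, &rate, &dist);
  /* rate / 256.0 is the modeled bit count for the block. */
}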
+ if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; + const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = (n * r_q10 + 2) >> 2; + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[16], + ENTROPY_CONTEXT t_left[16]) { + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_context; + const ENTROPY_CONTEXT *const left = pd->left_context; + + int i; + switch (tx_size) { + case TX_4X4: + vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); + break; + case TX_8X8: + for (i = 0; i < num_4x4_w; i += 2) + t_above[i] = !!*(const uint16_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 2) + t_left[i] = !!*(const uint16_t *)&left[i]; + break; + case TX_16X16: + for (i = 0; i < num_4x4_w; i += 4) + t_above[i] = !!*(const uint32_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 4) + t_left[i] = !!*(const uint32_t *)&left[i]; + break; + case TX_32X32: + for (i = 0; i < num_4x4_w; i += 8) + t_above[i] = !!*(const uint64_t *)&above[i]; + for (i = 0; i < num_4x4_h; i += 8) + t_left[i] = !!*(const uint64_t *)&left[i]; + break; + default: + assert(0 && "Invalid transform size."); + break; + } +} + +void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, + int ref_frame, BLOCK_SIZE block_size) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + int i; + int zero_seen = 0; + int best_index = 0; + int best_sad = INT_MAX; + int this_sad = INT_MAX; + int max_mv = 0; + uint8_t *src_y_ptr = x->plane[0].src.buf; + uint8_t *ref_y_ptr; + const int num_mv_refs = MAX_MV_REF_CANDIDATES + + (cpi->sf.adaptive_motion_search && + block_size < cpi->sf.max_partition_size); + + MV pred_mv[3]; + pred_mv[0] = mbmi->ref_mvs[ref_frame][0].as_mv; + pred_mv[1] = mbmi->ref_mvs[ref_frame][1].as_mv; + pred_mv[2] = x->pred_mv[ref_frame]; + + // Get the sad for each candidate reference mv. + for (i = 0; i < num_mv_refs; ++i) { + const MV *this_mv = &pred_mv[i]; + + max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + if (is_zero_mv(this_mv) && zero_seen) + continue; + + zero_seen |= is_zero_mv(this_mv); + + ref_y_ptr = + &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)]; + + // Find sad for current vector. + this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, + ref_y_ptr, ref_y_stride); + + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + best_index = i; + } + } + + // Note the index of the mv that worked best in the reference list. 
+ x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void vp9_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv) { + int i; + + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, + i ? scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, + int ref_frame) { + const VP9_COMMON *const cm = &cpi->common; + const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]; + const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; + return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL; +} + +int vp9_get_switchable_rate(const VP9_COMP *cpi) { + const MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const int ctx = vp9_get_pred_context_switchable_interp(xd); + return SWITCHABLE_INTERP_RATE_FACTOR * + cpi->switchable_interp_costs[ctx][mbmi->interp_filter]; +} + +void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) { + int i; + RD_OPT *const rd = &cpi->rd; + SPEED_FEATURES *const sf = &cpi->sf; + + // Set baseline threshold values. + for (i = 0; i < MAX_MODES; ++i) + rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0; + + rd->thresh_mult[THR_NEARESTMV] = 0; + rd->thresh_mult[THR_NEARESTG] = 0; + rd->thresh_mult[THR_NEARESTA] = 0; + + rd->thresh_mult[THR_DC] += 1000; + + rd->thresh_mult[THR_NEWMV] += 1000; + rd->thresh_mult[THR_NEWA] += 1000; + rd->thresh_mult[THR_NEWG] += 1000; + + // Adjust threshold only in real time mode, which only uses last + // reference frame. + rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh; + + rd->thresh_mult[THR_NEARMV] += 1000; + rd->thresh_mult[THR_NEARA] += 1000; + rd->thresh_mult[THR_COMP_NEARESTLA] += 1000; + rd->thresh_mult[THR_COMP_NEARESTGA] += 1000; + + rd->thresh_mult[THR_TM] += 1000; + + rd->thresh_mult[THR_COMP_NEARLA] += 1500; + rd->thresh_mult[THR_COMP_NEWLA] += 2000; + rd->thresh_mult[THR_NEARG] += 1000; + rd->thresh_mult[THR_COMP_NEARGA] += 1500; + rd->thresh_mult[THR_COMP_NEWGA] += 2000; + + rd->thresh_mult[THR_ZEROMV] += 2000; + rd->thresh_mult[THR_ZEROG] += 2000; + rd->thresh_mult[THR_ZEROA] += 2000; + rd->thresh_mult[THR_COMP_ZEROLA] += 2500; + rd->thresh_mult[THR_COMP_ZEROGA] += 2500; + + rd->thresh_mult[THR_H_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_D45_PRED ] += 2500; + rd->thresh_mult[THR_D135_PRED] += 2500; + rd->thresh_mult[THR_D117_PRED] += 2500; + rd->thresh_mult[THR_D153_PRED] += 2500; + rd->thresh_mult[THR_D207_PRED] += 2500; + rd->thresh_mult[THR_D63_PRED] += 2500; + + // Disable frame modes if flags not set. 
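For context on the thresholds being set here: the mode loop prunes a mode when the best rd cost found so far is already below that mode's frequency-scaled threshold, and a threshold of INT_MAX therefore disables the mode outright. A sketch mirroring rd_less_than_thresh() from vp9_rd.h (declared further down); inputs are illustrative:

#include <limits.h>
#include <stdint.h>

/* Nonzero means "skip this mode"; the >> 5 normalizes the per-mode
 * frequency factor. */
static int sketch_prune_mode(int64_t best_rd, int thresh, int thresh_fact) {
  return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
}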
+ if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) { + rd->thresh_mult[THR_NEWMV ] = INT_MAX; + rd->thresh_mult[THR_NEARESTMV] = INT_MAX; + rd->thresh_mult[THR_ZEROMV ] = INT_MAX; + rd->thresh_mult[THR_NEARMV ] = INT_MAX; + } + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + rd->thresh_mult[THR_NEARESTG ] = INT_MAX; + rd->thresh_mult[THR_ZEROG ] = INT_MAX; + rd->thresh_mult[THR_NEARG ] = INT_MAX; + rd->thresh_mult[THR_NEWG ] = INT_MAX; + } + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) { + rd->thresh_mult[THR_NEARESTA ] = INT_MAX; + rd->thresh_mult[THR_ZEROA ] = INT_MAX; + rd->thresh_mult[THR_NEARA ] = INT_MAX; + rd->thresh_mult[THR_NEWA ] = INT_MAX; + } + + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) { + rd->thresh_mult[THR_COMP_ZEROLA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARLA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEWLA ] = INT_MAX; + } + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | VP9_ALT_FLAG)) { + rd->thresh_mult[THR_COMP_ZEROGA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX; + rd->thresh_mult[THR_COMP_NEARGA ] = INT_MAX; + rd->thresh_mult[THR_COMP_NEWGA ] = INT_MAX; + } +} + +void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { + const SPEED_FEATURES *const sf = &cpi->sf; + RD_OPT *const rd = &cpi->rd; + int i; + + for (i = 0; i < MAX_REFS; ++i) + rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0; + + rd->thresh_mult_sub8x8[THR_LAST] += 2500; + rd->thresh_mult_sub8x8[THR_GOLD] += 2500; + rd->thresh_mult_sub8x8[THR_ALTR] += 2500; + rd->thresh_mult_sub8x8[THR_INTRA] += 2500; + rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500; + rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500; + + // Check for masked out split cases. + for (i = 0; i < MAX_REFS; ++i) + if (sf->disable_split_mask & (1 << i)) + rd->thresh_mult_sub8x8[i] = INT_MAX; + + // Disable mode test if frame flag is not set. + if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) + rd->thresh_mult_sub8x8[THR_LAST] = INT_MAX; + if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) + rd->thresh_mult_sub8x8[THR_GOLD] = INT_MAX; + if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) + rd->thresh_mult_sub8x8[THR_ALTR] = INT_MAX; + if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != + (VP9_LAST_FLAG | VP9_ALT_FLAG)) + rd->thresh_mult_sub8x8[THR_COMP_LA] = INT_MAX; + if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != + (VP9_GOLD_FLAG | VP9_ALT_FLAG)) + rd->thresh_mult_sub8x8[THR_COMP_GA] = INT_MAX; +} diff --git a/media/libvpx/vp9/encoder/vp9_rd.h b/media/libvpx/vp9/encoder/vp9_rd.h new file mode 100644 index 000000000000..eeb5e0f84f6c --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_rd.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VP9_ENCODER_VP9_RD_H_
+#define VP9_ENCODER_VP9_RD_H_
+
+#include <limits.h>
+
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+
+#define RDCOST(RM, DM, R, D) \
+  (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+#define INVALID_MV 0x80008000
+
+#define MAX_MODES 30
+#define MAX_REFS 6
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+  THR_NEARESTMV,
+  THR_NEARESTA,
+  THR_NEARESTG,
+
+  THR_DC,
+
+  THR_NEWMV,
+  THR_NEWA,
+  THR_NEWG,
+
+  THR_NEARMV,
+  THR_NEARA,
+  THR_COMP_NEARESTLA,
+  THR_COMP_NEARESTGA,
+
+  THR_TM,
+
+  THR_COMP_NEARLA,
+  THR_COMP_NEWLA,
+  THR_NEARG,
+  THR_COMP_NEARGA,
+  THR_COMP_NEWGA,
+
+  THR_ZEROMV,
+  THR_ZEROG,
+  THR_ZEROA,
+  THR_COMP_ZEROLA,
+  THR_COMP_ZEROGA,
+
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D207_PRED,
+  THR_D153_PRED,
+  THR_D63_PRED,
+  THR_D117_PRED,
+  THR_D45_PRED,
+} THR_MODES;
+
+typedef enum {
+  THR_LAST,
+  THR_GOLD,
+  THR_ALTR,
+  THR_COMP_LA,
+  THR_COMP_GA,
+  THR_INTRA,
+} THR_MODES_SUB8X8;
+
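A brief usage sketch of the RDCOST macro defined above, with sample values only: RM is the rate multiplier (RDMULT), DM is RDDIV == RDDIV_BITS, so the distortion term is effectively weighted by 128 while the rate term is rounded and scaled back from the encoder's Q8 bit-cost units:

#include <stdint.h>

/* Open-coded expansion of RDCOST(RM, DM, R, D); rate is expected in Q8
 * bit-cost units, dist in distortion (e.g. SSE) units. */
static int64_t sketch_rdcost(int rdmult, int rddiv, int rate, int64_t dist) {
  return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
}

Comparing two candidate modes is then a plain '<' on the returned cost.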
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+  int thresh_mult_sub8x8[MAX_REFS];
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+  int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+
+  int64_t comp_pred_diff[REFERENCE_MODES];
+  int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+  int64_t tx_select_diff[TX_MODES];
+  // TODO(agrange): can this overflow?
+  int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
+
+  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  int64_t mask_filter;
+
+  int RDMULT;
+  int RDDIV;
+} RD_OPT;
+
+struct TileInfo;
+struct VP9_COMP;
+struct macroblock;
+
+int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
+
+void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
+
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex);
+
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist);
+
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi);
+
+const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+                                                   int ref_frame);
+
+void vp9_init_me_luts();
+
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[16],
+                              ENTROPY_CONTEXT t_left[16]);
+
+void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
+
+void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+  return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
+void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x,
+                 uint8_t *ref_y_buffer, int ref_y_stride,
+                 int ref_frame, BLOCK_SIZE block_size);
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP9_ENCODER_VP9_RD_H_
diff --git a/media/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/vp9/encoder/vp9_rdopt.c
index 78cb06bc5ede..506c9bcd7151 100644
--- a/media/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/vp9/encoder/vp9_rdopt.c
@@ -8,42 +8,38 @@
  *  be found in the AUTHORS file in the root of the source tree.
 */
 
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <assert.h>
+#include <math.h>
 
-#include "vp9/common/vp9_pragmas.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_modecosts.h"
-#include "vp9/encoder/vp9_encodeintra.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_systemdependent.h"
 
-#define INVALID_MV 0x80008000
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_variance.h"
 
-/* Factor to weigh the rate for switchable interp filters */
-#define SWITCHABLE_INTERP_RATE_FACTOR 1
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC      1
 
 #define LAST_FRAME_MODE_MASK    0xFFEDCD60
 #define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
@@ -51,422 +47,212 @@
 
 #define MIN_EARLY_TERM_INDEX    3
 
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {NEARESTMV, LAST_FRAME,   NONE},
-  {NEARESTMV, ALTREF_FRAME, NONE},
-  {NEARESTMV, GOLDEN_FRAME, NONE},
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
 
-  {DC_PRED,   INTRA_FRAME,  NONE},
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
 
-  {NEWMV,     LAST_FRAME,   NONE},
-  {NEWMV,     ALTREF_FRAME, NONE},
-  {NEWMV,     GOLDEN_FRAME, NONE},
-
-  {NEARMV,    LAST_FRAME,   NONE},
-  {NEARMV,    ALTREF_FRAME, NONE},
-  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
-
-  {TM_PRED,   INTRA_FRAME,  NONE},
-
-  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
-  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
-  {NEARMV,    GOLDEN_FRAME, NONE},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
-
-  {ZEROMV,    LAST_FRAME,   NONE},
-  {ZEROMV,    GOLDEN_FRAME, NONE},
-  {ZEROMV,    ALTREF_FRAME, NONE},
-  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
-
-  {H_PRED,    INTRA_FRAME,  NONE},
-  {V_PRED,    INTRA_FRAME,  NONE},
-  {D135_PRED, INTRA_FRAME,  NONE},
-  {D207_PRED, INTRA_FRAME,  NONE},
-  {D153_PRED, INTRA_FRAME,  NONE},
-  {D63_PRED,  INTRA_FRAME,  NONE},
-  {D117_PRED, INTRA_FRAME,  NONE},
-  {D45_PRED,  INTRA_FRAME,  NONE},
+struct rdcost_block_args {
+  MACROBLOCK *x;
+  ENTROPY_CONTEXT t_above[16];
+  ENTROPY_CONTEXT t_left[16];
+  int rate;
+  int64_t dist;
+  int64_t sse;
+  int this_rate;
+  int64_t this_dist;
+  int64_t this_sse;
+
int64_t this_rd; + int64_t best_rd; + int skip; + int use_fast_coef_costing; + const scan_order *so; }; -const REF_DEFINITION vp9_ref_order[MAX_REFS] = { - {LAST_FRAME, NONE}, - {GOLDEN_FRAME, NONE}, - {ALTREF_FRAME, NONE}, - {LAST_FRAME, ALTREF_FRAME}, - {GOLDEN_FRAME, ALTREF_FRAME}, - {INTRA_FRAME, NONE}, +static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { + {NEARESTMV, {LAST_FRAME, NONE}}, + {NEARESTMV, {ALTREF_FRAME, NONE}}, + {NEARESTMV, {GOLDEN_FRAME, NONE}}, + + {DC_PRED, {INTRA_FRAME, NONE}}, + + {NEWMV, {LAST_FRAME, NONE}}, + {NEWMV, {ALTREF_FRAME, NONE}}, + {NEWMV, {GOLDEN_FRAME, NONE}}, + + {NEARMV, {LAST_FRAME, NONE}}, + {NEARMV, {ALTREF_FRAME, NONE}}, + {NEARESTMV, {LAST_FRAME, ALTREF_FRAME}}, + {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}}, + + {TM_PRED, {INTRA_FRAME, NONE}}, + + {NEARMV, {LAST_FRAME, ALTREF_FRAME}}, + {NEWMV, {LAST_FRAME, ALTREF_FRAME}}, + {NEARMV, {GOLDEN_FRAME, NONE}}, + {NEARMV, {GOLDEN_FRAME, ALTREF_FRAME}}, + {NEWMV, {GOLDEN_FRAME, ALTREF_FRAME}}, + + {ZEROMV, {LAST_FRAME, NONE}}, + {ZEROMV, {GOLDEN_FRAME, NONE}}, + {ZEROMV, {ALTREF_FRAME, NONE}}, + {ZEROMV, {LAST_FRAME, ALTREF_FRAME}}, + {ZEROMV, {GOLDEN_FRAME, ALTREF_FRAME}}, + + {H_PRED, {INTRA_FRAME, NONE}}, + {V_PRED, {INTRA_FRAME, NONE}}, + {D135_PRED, {INTRA_FRAME, NONE}}, + {D207_PRED, {INTRA_FRAME, NONE}}, + {D153_PRED, {INTRA_FRAME, NONE}}, + {D63_PRED, {INTRA_FRAME, NONE}}, + {D117_PRED, {INTRA_FRAME, NONE}}, + {D45_PRED, {INTRA_FRAME, NONE}}, }; -// The baseline rd thresholds for breaking out of the rd loop for -// certain modes are assumed to be based on 8x8 blocks. -// This table is used to correct for blocks size. -// The factors here are << 2 (2 = x0.5, 32 = x8 etc). -static int rd_thresh_block_size_factor[BLOCK_SIZES] = - {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32}; +static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { + {{LAST_FRAME, NONE}}, + {{GOLDEN_FRAME, NONE}}, + {{ALTREF_FRAME, NONE}}, + {{LAST_FRAME, ALTREF_FRAME}}, + {{GOLDEN_FRAME, ALTREF_FRAME}}, + {{INTRA_FRAME, NONE}}, +}; -#define RD_THRESH_MAX_FACT 64 -#define RD_THRESH_INC 1 -#define RD_THRESH_POW 1.25 -#define RD_MULT_EPB_RATIO 64 - -#define MV_COST_WEIGHT 108 -#define MV_COST_WEIGHT_SUB 120 - -static void fill_token_costs(vp9_coeff_cost *c, - vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { - int i, j, k, l; - TX_SIZE t; - for (t = TX_4X4; t <= TX_32X32; t++) - for (i = 0; i < BLOCK_TYPES; i++) - for (j = 0; j < REF_TYPES; j++) - for (k = 0; k < COEF_BANDS; k++) - for (l = 0; l < PREV_COEF_CONTEXTS; l++) { - vp9_prob probs[ENTROPY_NODES]; - vp9_model_to_full_probs(p[t][i][j][k][l], probs); - vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, - vp9_coef_tree); - vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, - vp9_coef_tree); - assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] == - c[t][i][j][k][1][l][DCT_EOB_TOKEN]); - } +static int raster_block_offset(BLOCK_SIZE plane_bsize, + int raster_block, int stride) { + const int bw = b_width_log2(plane_bsize); + const int y = 4 * (raster_block >> bw); + const int x = 4 * (raster_block & ((1 << bw) - 1)); + return y * stride + x; +} +static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, + int raster_block, int16_t *base) { + const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + return base + raster_block_offset(plane_bsize, raster_block, stride); } -static const int rd_iifactor[32] = { - 4, 4, 3, 2, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -// 3* dc_qlookup[Q]*dc_qlookup[Q]; - -/* values are now 
correlated to quantizer */ -static int sad_per_bit16lut[QINDEX_RANGE]; -static int sad_per_bit4lut[QINDEX_RANGE]; - -void vp9_init_me_luts() { +static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int m, int n, int min_plane, int max_plane) { int i; - // Initialize the sad lut tables using a formulaic calculation for now - // This is to make it easier to resolve the impact of experimental changes - // to the quantizer tables. - for (i = 0; i < QINDEX_RANGE; i++) { - sad_per_bit16lut[i] = - (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107); - sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742); + for (i = min_plane; i < max_plane; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &x->e_mbd.plane[i]; + + p->coeff = ctx->coeff_pbuf[i][m]; + p->qcoeff = ctx->qcoeff_pbuf[i][m]; + pd->dqcoeff = ctx->dqcoeff_pbuf[i][m]; + p->eobs = ctx->eobs_pbuf[i][m]; + + ctx->coeff_pbuf[i][m] = ctx->coeff_pbuf[i][n]; + ctx->qcoeff_pbuf[i][m] = ctx->qcoeff_pbuf[i][n]; + ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n]; + ctx->eobs_pbuf[i][m] = ctx->eobs_pbuf[i][n]; + + ctx->coeff_pbuf[i][n] = p->coeff; + ctx->qcoeff_pbuf[i][n] = p->qcoeff; + ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff; + ctx->eobs_pbuf[i][n] = p->eobs; } } -int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) { - const int q = vp9_dc_quant(qindex, 0); - // TODO(debargha): Adjust the function below - int rdmult = 88 * q * q / 25; - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { - if (cpi->twopass.next_iiratio > 31) - rdmult += (rdmult * rd_iifactor[31]) >> 4; - else - rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; - } - return rdmult; -} - -static int compute_rd_thresh_factor(int qindex) { - int q; - // TODO(debargha): Adjust the function below - q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12); - if (q < 8) - q = 8; - return q; -} - -void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { - cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; - cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; -} - -static void set_block_thresholds(VP9_COMP *cpi) { - int i, bsize, segment_id; - VP9_COMMON *cm = &cpi->common; - - for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { - int q; - int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ); - q = compute_rd_thresh_factor(segment_qindex); - - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - for (i = 0; i < MAX_MODES; ++i) { - if (cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[segment_id][bsize][i] = - cpi->sf.thresh_mult[i] * q * - rd_thresh_block_size_factor[bsize] / 4; - } else { - cpi->rd_threshes[segment_id][bsize][i] = INT_MAX; - } - } - - for (i = 0; i < MAX_REFS; ++i) { - if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) { - cpi->rd_thresh_sub8x8[segment_id][bsize][i] = - cpi->sf.thresh_mult_sub8x8[i] * q * - rd_thresh_block_size_factor[bsize] / 4; - } else { - cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX; - } - } - } - } -} - -void vp9_initialize_rd_consts(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int qindex, i; - - vp9_clear_system_state(); // __asm emms; - - // Further tests required to see if optimum is different - // for key frames, golden frames and arf 
frames. - // if (cpi->common.refresh_golden_frame || - // cpi->common.refresh_alt_ref_frame) - qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ); - - cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128) - cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex); - - cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO; - cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); - - vp9_set_speed_features(cpi); - - cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL && - cm->frame_type != KEY_FRAME) ? - 0 : 1; - - set_block_thresholds(cpi); - - fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs); - - for (i = 0; i < PARTITION_CONTEXTS; i++) - vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i), - vp9_partition_tree); - - /*rough estimate for costing*/ - vp9_init_mode_costs(cpi); - - if (!frame_is_intra_only(cm)) { - vp9_build_nmv_cost_table( - cpi->mb.nmvjointcost, - cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost, - &cm->fc.nmvc, - cm->allow_high_precision_mv, 1, 1); - - for (i = 0; i < INTER_MODE_CONTEXTS; i++) { - MB_PREDICTION_MODE m; - - for (m = NEARESTMV; m < MB_MODE_COUNT; m++) - cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] = - cost_token(vp9_inter_mode_tree, - cm->fc.inter_mode_probs[i], - &vp9_inter_mode_encodings[INTER_OFFSET(m)]); - } - } -} - -static INLINE void linear_interpolate2(double x, int ntab, int inv_step, - const double *tab1, const double *tab2, - double *v1, double *v2) { - double y = x * inv_step; - int d = (int) y; - if (d >= ntab - 1) { - *v1 = tab1[ntab - 1]; - *v2 = tab2[ntab - 1]; - } else { - double a = y - d; - *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a; - *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a; - } -} - -static void model_rd_norm(double x, double *R, double *D) { - static const int inv_tab_step = 8; - static const int tab_size = 120; - // NOTE: The tables below must be of the same size - // - // Normalized rate - // This table models the rate for a Laplacian source - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expression is: - // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], - // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), - // and H(x) is the binary entropy function. - static const double rate_tab[] = { - 64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194, - 2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206, - 1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708, - 0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412, - 0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236, - 0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132, - 0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073, - 0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040, - 0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022, - 0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012, - 0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006, - 0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003, - 0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002, - 0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, - 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000, - }; - // Normalized distortion - // This table models the normalized distortion for a Laplacian source - // source with given variance when quantized with a uniform quantizer - // with given stepsize. 
The closed form expression is: - // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) - // where x = qpstep / sqrt(variance) - // Note the actual distortion is Dn * variance. - static const double dist_tab[] = { - 0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061, - 0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242, - 0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458, - 0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645, - 0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780, - 0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870, - 0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925, - 0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957, - 0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976, - 0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987, - 0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993, - 0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996, - 0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998, - 0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999, - 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000, - }; - /* - assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]); - assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]); - assert(sizeof(rate_tab) == sizeof(dist_tab)); - */ - assert(x >= 0.0); - linear_interpolate2(x, tab_size, inv_tab_step, - rate_tab, dist_tab, R, D); -} - -static void model_rd_from_var_lapndz(int var, int n, int qstep, - int *rate, int64_t *dist) { - // This function models the rate and distortion for a Laplacian - // source with given variance when quantized with a uniform quantizer - // with given stepsize. The closed form expressions are in: - // Hang and Chen, "Source Model for transform video coder and its - // application - Part I: Fundamental Theory", IEEE Trans. Circ. - // Sys. for Video Tech., April 1997. - vp9_clear_system_state(); - if (var == 0 || n == 0) { - *rate = 0; - *dist = 0; - } else { - double D, R; - double s2 = (double) var / n; - double x = qstep / sqrt(s2); - model_rd_norm(x, &R, &D); - *rate = (int)((n << 8) * R + 0.5); - *dist = (int)(var * D + 0.5); - } - vp9_clear_system_state(); -} - static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
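A one-line illustration of that scaling (the dequant value is invented):

/* VP9's dequant tables are expressed at 8x the orthonormal scale, so the
 * modeling functions receive dequant[1] >> 3; e.g. an assumed AC dequant
 * of 160 yields an effective step of 20. */
static int sketch_effective_qstep(int ac_dequant) {
  return ac_dequant >> 3;
}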
- int i, rate_sum = 0, dist_sum = 0; + int i; + int64_t rate_sum = 0; + int64_t dist_sum = 0; + const int ref = xd->mi[0]->mbmi.ref_frame[0]; + unsigned int sse; + unsigned int var = 0; + unsigned int sum_sse = 0; + const int shift = 8; + int rate; + int64_t dist; + + x->pred_sse[ref] = 0; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - unsigned int sse; - int rate; - int64_t dist; - (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], - pd->dequant[1] >> 3, &rate, &dist); + const TX_SIZE max_tx_size = max_txsize_lookup[bs]; + const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size]; + int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]); + int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]); + int idx, idy; + int lw = b_width_log2_lookup[unit_size] + 2; + int lh = b_height_log2_lookup[unit_size] + 2; - rate_sum += rate; - dist_sum += (int)dist; - } + sum_sse = 0; - *out_rate_sum = rate_sum; - *out_dist_sum = dist_sum << 4; -} + for (idy = 0; idy < bh; ++idy) { + for (idx = 0; idx < bw; ++idx) { + uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); + uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh); + int block_idx = (idy << 1) + idx; -static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize, - TX_SIZE tx_size, - MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int64_t *out_dist_sum, - int *out_skip) { - int j, k; - BLOCK_SIZE bs; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; - const int width = 4 << num_4x4_blocks_wide_lookup[bsize]; - const int height = 4 << num_4x4_blocks_high_lookup[bsize]; - int rate_sum = 0; - int64_t dist_sum = 0; - const int t = 4 << tx_size; + var = cpi->fn_ptr[unit_size].vf(src, p->src.stride, + dst, pd->dst.stride, &sse); + x->bsse[(i << 2) + block_idx] = sse; + sum_sse += sse; - if (tx_size == TX_4X4) { - bs = BLOCK_4X4; - } else if (tx_size == TX_8X8) { - bs = BLOCK_8X8; - } else if (tx_size == TX_16X16) { - bs = BLOCK_16X16; - } else if (tx_size == TX_32X32) { - bs = BLOCK_32X32; - } else { - assert(0); - } + if (!x->select_tx_size) { + if (x->bsse[(i << 2) + block_idx] < p->quant_thred[0] >> shift) + x->skip_txfm[(i << 2) + block_idx] = 1; + else if (var < p->quant_thred[1] >> shift) + x->skip_txfm[(i << 2) + block_idx] = 2; + else + x->skip_txfm[(i << 2) + block_idx] = 0; + } - *out_skip = 1; - for (j = 0; j < height; j += t) { - for (k = 0; k < width; k += t) { - int rate; + if (i == 0) + x->pred_sse[ref] += sse; + } + } + + // Fast approximate the modelling function. 
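The speed > 4 path that follows replaces the Laplacian table lookup with a linear fit in the quantizer; a standalone sketch of the same arithmetic (inputs are illustrative):

#include <stdint.h>

/* Linear stand-in for the Laplacian model: rate falls linearly with the
 * quantizer (clamped to 0 from step 120 up), distortion rises linearly. */
static void sketch_fast_model(int64_t square_error, int quantizer,
                              int64_t *rate, int64_t *dist) {
  *rate = quantizer < 120 ? (square_error * (280 - quantizer)) >> 8 : 0;
  *dist = (square_error * quantizer) >> 8;
}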
+ if (cpi->oxcf.speed > 4) { + int64_t rate; int64_t dist; - unsigned int sse; - cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride, - &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride, - &sse); - // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist); + int64_t square_error = sse; + int quantizer = (pd->dequant[1] >> 3); + + if (quantizer < 120) + rate = (square_error * (280 - quantizer)) >> 8; + else + rate = 0; + dist = (square_error * quantizer) >> 8; + rate_sum += rate; + dist_sum += dist; + } else { + vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs], + pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; dist_sum += dist; - *out_skip &= (rate < 1024); } } - *out_rate_sum = rate_sum; + *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum << 4; } -int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, +int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; int64_t error = 0, sqcoeff = 0; for (i = 0; i < block_size; i++) { - int this_diff = coeff[i] - dqcoeff[i]; - error += (unsigned)this_diff * this_diff; - sqcoeff += (unsigned) coeff[i] * coeff[i]; + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; } *ssz = sqcoeff; @@ -484,43 +270,41 @@ static const int16_t band_counts[TX_SIZES][8] = { { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 }, }; - static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, - const int16_t *scan, const int16_t *nb) { + const int16_t *scan, const int16_t *nb, + int use_fast_coef_costing) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct macroblockd_plane *pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + const struct macroblock_plane *p = &x->plane[plane]; + const struct macroblockd_plane *pd = &xd->plane[plane]; const PLANE_TYPE type = pd->plane_type; const int16_t *band_count = &band_counts[tx_size][1]; - const int eob = pd->eobs[block]; - const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - const int ref = mbmi->ref_frame[0] != INTRA_FRAME; - unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = - x->token_costs[tx_size][type][ref]; - const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; - uint8_t *p_tok = x->token_cache; - int pt = combine_entropy_contexts(above_ec, left_ec); + const int eob = p->eobs[block]; + const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); + unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + x->token_costs[tx_size][type][is_inter_block(mbmi)]; + uint8_t token_cache[32 * 32]; + int pt = combine_entropy_contexts(*A, *L); int c, cost; - // Check for consistency of tx_size with mode info - assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size - : get_uv_tx_size(mbmi) == tx_size); + assert(type == PLANE_TYPE_Y ? 
mbmi->tx_size == tx_size + : get_uv_tx_size(mbmi, pd) == tx_size); if (eob == 0) { // single eob token - cost = token_costs[0][0][pt][DCT_EOB_TOKEN]; + cost = token_costs[0][0][pt][EOB_TOKEN]; c = 0; } else { int band_left = *band_count++; // dc token - int v = qcoeff_ptr[0]; + int v = qcoeff[0]; int prev_t = vp9_dct_value_tokens_ptr[v].token; cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; - p_tok[0] = vp9_pt_energy_class[prev_t]; + token_cache[0] = vp9_pt_energy_class[prev_t]; ++token_costs; // ac tokens @@ -528,11 +312,15 @@ static INLINE int cost_coeffs(MACROBLOCK *x, const int rc = scan[c]; int t; - v = qcoeff_ptr[rc]; + v = qcoeff[rc]; t = vp9_dct_value_tokens_ptr[v].token; - pt = get_coef_context(nb, p_tok, c); - cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; - p_tok[rc] = vp9_pt_energy_class[t]; + if (use_fast_coef_costing) { + cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v]; + } else { + pt = get_coef_context(nb, token_cache, c); + cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; + token_cache[rc] = vp9_pt_energy_class[t]; + } prev_t = t; if (!--band_left) { band_left = *band_count++; @@ -542,8 +330,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x, // eob token if (band_left) { - pt = get_coef_context(nb, p_tok, c); - cost += (*token_costs)[0][pt][DCT_EOB_TOKEN]; + if (use_fast_coef_costing) { + cost += (*token_costs)[0][!prev_t][EOB_TOKEN]; + } else { + pt = get_coef_context(nb, token_cache, c); + cost += (*token_costs)[0][pt][EOB_TOKEN]; + } } } @@ -552,24 +344,22 @@ static INLINE int cost_coeffs(MACROBLOCK *x, return cost; } - -static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { +static void dist_block(int plane, int block, TX_SIZE tx_size, + struct rdcost_block_args* args) { const int ss_txfrm_size = tx_size << 1; - struct rdcost_block_args* args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; int64_t this_sse; - int shift = args->tx_size == TX_32X32 ? 0 : 2; + int shift = tx_size == TX_32X32 ? 0 : 2; int16_t *const coeff = BLOCK_OFFSET(p->coeff, block); int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; args->sse = this_sse >> shift; - if (x->skip_encode && - xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) { + if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) { // TODO(jingning): tune the model to better capture the distortion. 
int64_t p = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> (shift + 2); @@ -579,42 +369,66 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) { } static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { - struct rdcost_block_args* args = arg; - + TX_SIZE tx_size, struct rdcost_block_args* args) { int x_idx, y_idx; - txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx); + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx); args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx, - args->t_left + y_idx, args->tx_size, - args->scan, args->nb); + args->t_left + y_idx, tx_size, + args->so->scan, args->so->neighbors, + args->use_fast_coef_costing); } -static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { +static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { struct rdcost_block_args *args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - struct encode_b_args encode_args = {x, NULL}; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; int64_t rd1, rd2, rd; if (args->skip) return; - if (!is_inter_block(&xd->mi_8x8[0]->mbmi)) - vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args); - else - vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args); + if (!is_inter_block(mbmi)) { + vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); + dist_block(plane, block, tx_size, args); + } else if (max_txsize_lookup[plane_bsize] == tx_size) { + if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + dist_block(plane, block, tx_size, args); + } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) { + // compute DC coefficient + int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; + args->dist = args->sse; + if (!x->plane[plane].eobs[block]) + args->dist = args->sse - ((coeff[0] * coeff[0] - + (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2); + } else { + // skip forward transform + x->plane[plane].eobs[block] = 0; + args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; + args->dist = args->sse; + } + } else { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + dist_block(plane, block, tx_size, args); + } - dist_block(plane, block, tx_size, args); rate_block(plane, block, plane_bsize, tx_size, args); rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); // TODO(jingning): temporarily enabled only for luma component rd = MIN(rd1, rd2); - if (!xd->lossless && plane == 0) - x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block]; + if (plane == 0) + x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] || + (rd1 > rd2 && !xd->lossless); args->this_rate += args->rate; args->this_dist += args->dist; @@ -627,371 +441,172 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, } } -void vp9_get_entropy_contexts(TX_SIZE tx_size, - ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], - const 
ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, - int num_4x4_w, int num_4x4_h) { - int i; - switch (tx_size) { - case TX_4X4: - vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); - vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); - break; - case TX_8X8: - for (i = 0; i < num_4x4_w; i += 2) - t_above[i] = !!*(const uint16_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 2) - t_left[i] = !!*(const uint16_t *)&left[i]; - break; - case TX_16X16: - for (i = 0; i < num_4x4_w; i += 4) - t_above[i] = !!*(const uint32_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 4) - t_left[i] = !!*(const uint32_t *)&left[i]; - break; - case TX_32X32: - for (i = 0; i < num_4x4_w; i += 8) - t_above[i] = !!*(const uint64_t *)&above[i]; - for (i = 0; i < num_4x4_h; i += 8) - t_left[i] = !!*(const uint64_t *)&left[i]; - break; - default: - assert(!"Invalid transform size."); - } -} - -static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size, - const int num_4x4_w, const int num_4x4_h, - const int64_t ref_rdcost, - struct rdcost_block_args *arg) { - vpx_memset(arg, 0, sizeof(struct rdcost_block_args)); - arg->x = x; - arg->tx_size = tx_size; - arg->bw = num_4x4_w; - arg->bh = num_4x4_h; - arg->best_rd = ref_rdcost; -} - static void txfm_rd_in_plane(MACROBLOCK *x, - struct rdcost_block_args *rd_stack, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size) { + BLOCK_SIZE bsize, TX_SIZE tx_size, + int use_fast_coef_casting) { MACROBLOCKD *const xd = &x->e_mbd; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - const int num_4x4_w = num_4x4_blocks_wide_lookup[bs]; - const int num_4x4_h = num_4x4_blocks_high_lookup[bs]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + vp9_zero(args); + args.x = x; + args.best_rd = ref_best_rd; + args.use_fast_coef_costing = use_fast_coef_casting; - init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h, - ref_best_rd, rd_stack); if (plane == 0) - xd->mi_8x8[0]->mbmi.tx_size = tx_size; + xd->mi[0]->mbmi.tx_size = tx_size; - vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left, - pd->above_context, pd->left_context, - num_4x4_w, num_4x4_h); + vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left); - get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb); + args.so = get_scan(xd, tx_size, pd->plane_type, 0); - foreach_transformed_block_in_plane(xd, bsize, plane, - block_yrd_txfm, rd_stack); - if (rd_stack->skip) { + vp9_foreach_transformed_block_in_plane(xd, bsize, plane, + block_rd_txfm, &args); + if (args.skip) { *rate = INT_MAX; *distortion = INT64_MAX; *sse = INT64_MAX; *skippable = 0; } else { - *distortion = rd_stack->this_dist; - *rate = rd_stack->this_rate; - *sse = rd_stack->this_sse; - *skippable = vp9_is_skippable_in_plane(xd, bsize, plane); + *distortion = args.this_dist; + *rate = args.this_rate; + *sse = args.this_sse; + *skippable = vp9_is_skippable_in_plane(x, bsize, plane); } } -static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skip, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE bs) { +static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, int64_t *distortion, + int *skip, int64_t *sse, + int64_t ref_best_rd, + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; 
const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; mbmi->tx_size = MIN(max_tx_size, largest_tx_size); - txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, - mbmi->tx_size); - cpi->tx_stepdown_count[0]++; + txfm_rd_in_plane(x, rate, distortion, skip, + sse, ref_best_rd, 0, bs, + mbmi->tx_size, cpi->sf.use_fast_coef_costing); } -static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, - int64_t tx_cache[TX_MODES], - BLOCK_SIZE bs) { +static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, + int *rate, + int64_t *distortion, + int *skip, + int64_t *psse, + int64_t tx_cache[TX_MODES], + int64_t ref_best_rd, + BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); - int64_t rd[TX_SIZES][2]; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES], sse[TX_SIZES]; + int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}, + {INT64_MAX, INT64_MAX}}; int n, m; int s0, s1; + const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; + int64_t best_rd = INT64_MAX; + TX_SIZE best_tx = max_tx_size; const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); - - for (n = TX_4X4; n <= max_tx_size; n++) { - r[n][1] = r[n][0]; - if (r[n][0] == INT_MAX) - continue; - for (m = 0; m <= n - (n == max_tx_size); m++) { - if (m == n) - r[n][1] += vp9_cost_zero(tx_probs[m]); - else - r[n][1] += vp9_cost_one(tx_probs[m]); - } - } - assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); - for (n = TX_4X4; n <= max_tx_size; n++) { + for (n = max_tx_size; n >= 0; n--) { + txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n], + &sse[n], ref_best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); + r[n][1] = r[n][0]; + if (r[n][0] < INT_MAX) { + for (m = 0; m <= n - (n == (int) max_tx_size); m++) { + if (m == n) + r[n][1] += vp9_cost_zero(tx_probs[m]); + else + r[n][1] += vp9_cost_one(tx_probs[m]); + } + } if (d[n] == INT64_MAX) { rd[n][0] = rd[n][1] = INT64_MAX; - continue; - } - if (s[n]) { + } else if (s[n]) { rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); } else { rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); } - } - if (max_tx_size == TX_32X32 && - (cm->tx_mode == ALLOW_32X32 || - (cm->tx_mode == TX_MODE_SELECT && - rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && - rd[TX_32X32][1] < rd[TX_4X4][1]))) { - mbmi->tx_size = TX_32X32; - } else if (max_tx_size >= TX_16X16 && - (cm->tx_mode == ALLOW_16X16 || - cm->tx_mode == ALLOW_32X32 || - (cm->tx_mode == TX_MODE_SELECT && - rd[TX_16X16][1] < rd[TX_8X8][1] && - rd[TX_16X16][1] < rd[TX_4X4][1]))) { - mbmi->tx_size = TX_16X16; - } else if (cm->tx_mode == ALLOW_8X8 || - cm->tx_mode == ALLOW_16X16 || - cm->tx_mode == ALLOW_32X32 || - (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) { - mbmi->tx_size = TX_8X8; - } else { - mbmi->tx_size = TX_4X4; + // Early 
termination in transform size search. + if (cpi->sf.tx_size_search_breakout && + (rd[n][1] == INT64_MAX || + (n < (int) max_tx_size && rd[n][1] > rd[n + 1][1]) || + s[n] == 1)) + break; + + if (rd[n][1] < best_rd) { + best_tx = n; + best_rd = rd[n][1]; + } } + mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ? + best_tx : MIN(max_tx_size, max_mode_tx_size); + *distortion = d[mbmi->tx_size]; *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->tx_size]; + *psse = sse[mbmi->tx_size]; tx_cache[ONLY_4X4] = rd[TX_4X4][0]; tx_cache[ALLOW_8X8] = rd[TX_8X8][0]; tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0]; tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0]; - if (max_tx_size == TX_32X32 && - rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && - rd[TX_32X32][1] < rd[TX_4X4][1]) + + if (max_tx_size == TX_32X32 && best_tx == TX_32X32) { tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1]; - else if (max_tx_size >= TX_16X16 && - rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]) + } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) { tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1]; - else - tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ? - rd[TX_4X4][1] : rd[TX_8X8][1]; - - if (max_tx_size == TX_32X32 && - rd[TX_32X32][1] < rd[TX_16X16][1] && - rd[TX_32X32][1] < rd[TX_8X8][1] && - rd[TX_32X32][1] < rd[TX_4X4][1]) { - cpi->tx_stepdown_count[0]++; - } else if (max_tx_size >= TX_16X16 && - rd[TX_16X16][1] < rd[TX_8X8][1] && - rd[TX_16X16][1] < rd[TX_4X4][1]) { - cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { - cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; + tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1]; } else { - cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; + tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1]; } } -static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, int64_t *sse, - int64_t ref_best_rd, - BLOCK_SIZE bs) { - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); - int64_t rd[TX_SIZES][2]; - int n, m; - int s0, s1; - double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; - // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; - - const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs); - - // for (n = TX_4X4; n <= max_txfm_size; n++) - // r[n][0] = (r[n][0] * scale_r[n]); - - for (n = TX_4X4; n <= max_tx_size; n++) { - r[n][1] = r[n][0]; - for (m = 0; m <= n - (n == max_tx_size); m++) { - if (m == n) - r[n][1] += vp9_cost_zero(tx_probs[m]); - else - r[n][1] += vp9_cost_one(tx_probs[m]); - } - } - - assert(skip_prob > 0); - s0 = vp9_cost_bit(skip_prob, 0); - s1 = vp9_cost_bit(skip_prob, 1); - - for (n = TX_4X4; n <= max_tx_size; n++) { - if (s[n]) { - rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]); - } else { - rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]); - rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]); - } - } - for (n = TX_4X4; n <= max_tx_size; n++) { - rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]); - rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]); - } - - if (max_tx_size == TX_32X32 && - (cm->tx_mode == ALLOW_32X32 || - (cm->tx_mode == TX_MODE_SELECT && - rd[TX_32X32][1] <= rd[TX_16X16][1] && - rd[TX_32X32][1] <= rd[TX_8X8][1] && - 
rd[TX_32X32][1] <= rd[TX_4X4][1]))) { - mbmi->tx_size = TX_32X32; - } else if (max_tx_size >= TX_16X16 && - (cm->tx_mode == ALLOW_16X16 || - cm->tx_mode == ALLOW_32X32 || - (cm->tx_mode == TX_MODE_SELECT && - rd[TX_16X16][1] <= rd[TX_8X8][1] && - rd[TX_16X16][1] <= rd[TX_4X4][1]))) { - mbmi->tx_size = TX_16X16; - } else if (cm->tx_mode == ALLOW_8X8 || - cm->tx_mode == ALLOW_16X16 || - cm->tx_mode == ALLOW_32X32 || - (cm->tx_mode == TX_MODE_SELECT && - rd[TX_8X8][1] <= rd[TX_4X4][1])) { - mbmi->tx_size = TX_8X8; - } else { - mbmi->tx_size = TX_4X4; - } - - // Actually encode using the chosen mode if a model was used, but do not - // update the r, d costs - txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size); - - if (max_tx_size == TX_32X32 && - rd[TX_32X32][1] <= rd[TX_16X16][1] && - rd[TX_32X32][1] <= rd[TX_8X8][1] && - rd[TX_32X32][1] <= rd[TX_4X4][1]) { - cpi->tx_stepdown_count[0]++; - } else if (max_tx_size >= TX_16X16 && - rd[TX_16X16][1] <= rd[TX_8X8][1] && - rd[TX_16X16][1] <= rd[TX_4X4][1]) { - cpi->tx_stepdown_count[max_tx_size - TX_16X16]++; - } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) { - cpi->tx_stepdown_count[max_tx_size - TX_8X8]++; - } else { - cpi->tx_stepdown_count[max_tx_size - TX_4X4]++; - } -} - -static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int64_t *distortion, - int *skip, int64_t *psse, BLOCK_SIZE bs, +static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skip, + int64_t *psse, BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack; - const int b_inter_mode = is_inter_block(mbmi); + int64_t sse; + int64_t *ret_sse = psse ? 
psse : &sse; - assert(bs == mbmi->sb_type); - if (b_inter_mode) - vp9_subtract_sby(x, bs); + assert(bs == xd->mi[0]->mbmi.sb_type); - if (cpi->sf.tx_size_search_method == USE_LARGESTALL || - (cpi->sf.tx_size_search_method != USE_FULL_RD && - !b_inter_mode)) { + if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, - ref_best_rd, bs); - if (psse) - *psse = sse[mbmi->tx_size]; - return; - } - - if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && - b_inter_mode) { - if (bs >= BLOCK_32X32) - model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, - &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); - if (bs >= BLOCK_16X16) - model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, - &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); - - model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, - &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); - - model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, - &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); - - choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, - skip, sse, ref_best_rd, bs); + choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, + bs); } else { - if (bs >= BLOCK_32X32) - txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32], - &s[TX_32X32], &sse[TX_32X32], - ref_best_rd, 0, bs, TX_32X32); - if (bs >= BLOCK_16X16) - txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16], - &s[TX_16X16], &sse[TX_16X16], - ref_best_rd, 0, bs, TX_16X16); - txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], - &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8); - txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], - &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4); - choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, bs); + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, + txfm_cache, ref_best_rd, bs); } - if (psse) - *psse = sse[mbmi->tx_size]; } -static int conditional_skipintra(MB_PREDICTION_MODE mode, - MB_PREDICTION_MODE best_intra_mode) { +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { if (mode == D117_PRED && best_intra_mode != V_PRED && best_intra_mode != D135_PRED) @@ -1012,27 +627,24 @@ static int conditional_skipintra(MB_PREDICTION_MODE mode, } static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, - MB_PREDICTION_MODE *best_mode, - int *bmode_costs, + PREDICTION_MODE *best_mode, + const int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, BLOCK_SIZE bsize, int64_t rd_thresh) { - MB_PREDICTION_MODE mode; - MACROBLOCKD *xd = &x->e_mbd; + PREDICTION_MODE mode; + MACROBLOCKD *const xd = &x->e_mbd; int64_t best_rd = rd_thresh; - int rate = 0; - int64_t distortion; + struct macroblock_plane *p = &x->plane[0]; struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; - uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib, - p->src.buf, src_stride); - uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib, - pd->dst.buf, dst_stride); - int16_t *src_diff, *coeff; - + const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib, + src_stride)]; + uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib, + dst_stride)]; ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; @@ -1045,11 +657,13 @@ static int64_t 
rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(ta, a, sizeof(ta)); vpx_memcpy(tl, l, sizeof(tl)); - xd->mi_8x8[0]->mbmi.tx_size = TX_4X4; + xd->mi[0]->mbmi.tx_size = TX_4X4; for (mode = DC_PRED; mode <= TM_PRED; ++mode) { int64_t this_rd; int ratey = 0; + int64_t distortion = 0; + int rate = bmode_costs[mode]; if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue; @@ -1061,56 +675,52 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, continue; } - rate = bmode_costs[mode]; - distortion = 0; - vpx_memcpy(tempa, ta, sizeof(ta)); vpx_memcpy(templ, tl, sizeof(tl)); for (idy = 0; idy < num_4x4_blocks_high; ++idy) { for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { - int64_t ssz; - const int16_t *scan; - const int16_t *nb; - uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; - uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; const int block = ib + idy * 2 + idx; - TX_TYPE tx_type; - xd->mi_8x8[0]->bmi[block].as_mode = mode; - src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff); - coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride]; + uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride]; + int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block, + p->src_diff); + int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block); + xd->mi[0]->bmi[block].as_mode = mode; vp9_predict_intra_block(xd, block, 1, TX_4X4, mode, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, - dst, dst_stride); - vp9_subtract_block(4, 4, src_diff, 8, - src, src_stride, - dst, dst_stride); + dst, dst_stride, idx, idy, 0); + vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); - tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block); - get_scan_nb_4x4(tx_type, &scan, &nb); - - if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, 8, tx_type); - else - x->fwd_txm4x4(src_diff, coeff, 8); - - vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type)); - - ratey += cost_coeffs(x, 0, block, - tempa + idx, templ + idy, TX_4X4, scan, nb); - distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), - 16, &ssz) >> 2; - if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) - goto next; - - if (tx_type != DCT_DCT) - vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block), - dst, pd->dst.stride, tx_type); - else - xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride, - 16); + if (xd->lossless) { + const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + vp9_fwht4x4(src_diff, coeff, 8); + vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next; + vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride, + p->eobs[block]); + } else { + int64_t unused; + const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); + const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + vp9_fht4x4(src_diff, coeff, 8, tx_type); + vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan); + ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4, + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); + distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block), + 16, &unused) >> 2; + if (RDCOST(x->rdmult, x->rddiv, ratey, 
distortion) >= best_rd) + goto next; + vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), + dst, dst_stride, p->eobs[block]); + } } } @@ -1143,18 +753,16 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, return best_rd; } -static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, - MACROBLOCK * const mb, - int * const rate, - int * const rate_y, - int64_t * const distortion, +static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb, + int *rate, int *rate_y, + int64_t *distortion, int64_t best_rd) { int i, j; - MACROBLOCKD *const xd = &mb->e_mbd; - MODE_INFO *const mic = xd->mi_8x8[0]; - const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; - const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL; - const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; + const MACROBLOCKD *const xd = &mb->e_mbd; + MODE_INFO *const mic = xd->mi[0]; + const MODE_INFO *above_mi = xd->mi[-xd->mi_stride]; + const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL; + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; @@ -1163,25 +771,23 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; - int *bmode_costs; + const int *bmode_costs = cpi->mbmode_cost; vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left)); - bmode_costs = mb->mbmode_cost; - // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block. for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - MB_PREDICTION_MODE best_mode = DC_PRED; + PREDICTION_MODE best_mode = DC_PRED; int r = INT_MAX, ry = INT_MAX; int64_t d = INT64_MAX, this_rd = INT64_MAX; i = idy * 2 + idx; if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i); - const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, i); + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i); - bmode_costs = mb->y_mode_costs[A][L]; + bmode_costs = cpi->y_mode_costs[A][L]; } this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, @@ -1220,15 +826,15 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t tx_cache[TX_MODES], int64_t best_rd) { - MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE mode_selected = DC_PRED; + PREDICTION_MODE mode; + PREDICTION_MODE mode_selected = DC_PRED; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi_8x8[0]; + MODE_INFO *const mic = xd->mi[0]; int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd; TX_SIZE best_tx = TX_4X4; int i; - int *bmode_costs = x->mbmode_cost; + int *bmode_costs = cpi->mbmode_cost; if (cpi->sf.tx_size_search_method == USE_FULL_RD) for (i = 0; i < TX_MODES; i++) @@ -1237,22 +843,19 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, /* Y Search for intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { int64_t local_tx_cache[TX_MODES]; - MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride]; - MODE_INFO *left_mi = xd->left_available ? 
xd->mi_8x8[-1] : NULL; - - if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode))) - continue; + MODE_INFO *above_mi = xd->mi[-xd->mi_stride]; + MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL; if (cpi->common.frame_type == KEY_FRAME) { - const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0); - const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, 0); + const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0); + const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0); - bmode_costs = x->y_mode_costs[A][L]; + bmode_costs = cpi->y_mode_costs[A][L]; } mic->mbmi.mode = mode; - super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, local_tx_cache, best_rd); + super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, + &s, NULL, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1287,13 +890,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } -static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, +static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, BLOCK_SIZE bsize, int64_t ref_best_rd) { MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi); + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); int plane; int pnrate = 0, pnskip = 1; int64_t pndist = 0, pnsse = 0; @@ -1301,8 +904,11 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, if (ref_best_rd < 0) goto term; - if (is_inter_block(mbmi)) - vp9_subtract_sbuv(x, bsize); + if (is_inter_block(mbmi)) { + int plane; + for (plane = 1; plane < MAX_MB_PLANE; ++plane) + vp9_subtract_plane(x, bsize, plane); + } *rate = 0; *distortion = 0; @@ -1310,8 +916,9 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x, *skippable = 1; for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse, - ref_best_rd, plane, bsize, uv_txfm_size); + txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, + ref_best_rd, plane, bsize, uv_tx_size, + cpi->sf.use_fast_coef_costing); if (pnrate == INT_MAX) goto term; *rate += pnrate; @@ -1333,30 +940,26 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, - BLOCK_SIZE bsize) { - MB_PREDICTION_MODE mode; - MB_PREDICTION_MODE mode_selected = DC_PRED; + BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + PREDICTION_MODE mode; + PREDICTION_MODE mode_selected = DC_PRED; int64_t best_rd = INT64_MAX, this_rd; int this_rate_tokenonly, this_rate, s; int64_t this_distortion, this_sse; - // int mode_mask = (bsize <= BLOCK_8X8) - // ? 
ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask; - - for (mode = DC_PRED; mode <= TM_PRED; mode ++) { - // if (!(mode_mask & (1 << mode))) - if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]] - & (1 << mode))) + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue; - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode; + xd->mi[0]->mbmi.uv_mode = mode; super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, &this_sse, bsize, best_rd); if (this_rate_tokenonly == INT_MAX) continue; this_rate = this_rate_tokenonly + - x->intra_uv_mode_cost[cpi->common.frame_type][mode]; + cpi->intra_uv_mode_cost[cpi->common.frame_type][mode]; this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion); if (this_rd < best_rd) { @@ -1366,92 +969,55 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate_tokenonly = this_rate_tokenonly; *distortion = this_distortion; *skippable = s; - if (!x->select_txfm_size) { - int i; - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = x->e_mbd.plane; - for (i = 1; i < MAX_MB_PLANE; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][2]; - pd[i].qcoeff = ctx->qcoeff_pbuf[i][2]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2]; - pd[i].eobs = ctx->eobs_pbuf[i][2]; - - ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0]; - ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0]; - ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0]; - ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0]; - - ctx->coeff_pbuf[i][0] = p[i].coeff; - ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff; - ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; - ctx->eobs_pbuf[i][0] = pd[i].eobs; - } - } + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE); } } - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected; - + xd->mi[0]->mbmi.uv_mode = mode_selected; return best_rd; } -static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, +static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { - int64_t this_rd; - int64_t this_sse; + const VP9_COMMON *cm = &cpi->common; + int64_t unused; - x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED; + x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED; super_block_uvrd(cpi, x, rate_tokenonly, distortion, - skippable, &this_sse, bsize, INT64_MAX); - *rate = *rate_tokenonly + - x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); - - return this_rd; + skippable, &unused, bsize, INT64_MAX); + *rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED]; + return RDCOST(x->rdmult, x->rddiv, *rate, *distortion); } static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE bsize, int *rate_uv, - int *rate_uv_tokenonly, + BLOCK_SIZE bsize, TX_SIZE max_tx_size, + int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv, int *skip_uv, - MB_PREDICTION_MODE *mode_uv) { + PREDICTION_MODE *mode_uv) { MACROBLOCK *const x = &cpi->mb; // Use an estimated rd for uv_intra based on DC_PRED if the // appropriate speed flag is set. if (cpi->sf.use_uv_intra_rd_estimate) { - rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, + skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); // Else do a proper rd search for each possible transform size that may // be considered in the main rd loop. 
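// For reference, the RDCOST(x->rdmult, x->rddiv, rate, dist) calls in the
// helpers above fold rate and distortion into a single comparable cost. A
// sketch of the usual libvpx form (an assumption about vp9_rdopt.h at this
// revision, kept as a comment so the surrounding context stays intact):
//   static INLINE int64_t rdcost(int rdmult, int rddiv,
//                                int rate, int64_t dist) {
//     // rate is weighted by rdmult in 1/256 units; dist is shifted by rddiv.
//     return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
//   }
// Each mode loop keeps whichever candidate minimizes this value.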
} else { rd_pick_intra_sbuv_mode(cpi, x, ctx, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv, - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size); } - *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode; + *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode; } -static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode, +static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode, int mode_context) { - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int segment_id = xd->mi_8x8[0]->mbmi.segment_id; - - // Don't account for mode here if segment skip is enabled. - if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { - assert(is_inter_mode(mode)); - return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; - } else { - return 0; - } -} - -void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) { - x->e_mbd.mi_8x8[0]->mbmi.mode = mb; - x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int; + assert(is_inter_mode(mode)); + return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)]; } static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, @@ -1461,79 +1027,59 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv); -static int labels2mode(MACROBLOCK *x, int i, - MB_PREDICTION_MODE this_mode, - int_mv *this_mv, int_mv *this_second_mv, - int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], - int_mv seg_mvs[MAX_REF_FRAMES], - int_mv *best_ref_mv, - int_mv *second_best_ref_mv, - int *mvjcost, int *mvcost[2], VP9_COMP *cpi) { - MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *const mic = xd->mi_8x8[0]; - MB_MODE_INFO *mbmi = &mic->mbmi; - int cost = 0, thismvcost = 0; +static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i, + PREDICTION_MODE mode, int_mv this_mv[2], + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], + int_mv seg_mvs[MAX_REF_FRAMES], + int_mv *best_ref_mv[2], const int *mvjcost, + int *mvcost[2]) { + MODE_INFO *const mic = xd->mi[0]; + const MB_MODE_INFO *const mbmi = &mic->mbmi; + int thismvcost = 0; int idx, idy; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; - const int has_second_rf = has_second_ref(mbmi); + const int is_compound = has_second_ref(mbmi); - /* We have to be careful retrieving previously-encoded motion vectors. - Ones from this macroblock have to be pulled from the BLOCKD array - as they have not yet made it to the bmi array in our MB_MODE_INFO. 
*/ - MB_PREDICTION_MODE m; - - // the only time we should do costing for new motion vector or mode - // is when we are on a new label (jbb May 08, 2007) - switch (m = this_mode) { + switch (mode) { case NEWMV: - this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; - thismvcost = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv, + this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int; + thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv, mvjcost, mvcost, MV_COST_WEIGHT_SUB); - if (has_second_rf) { - this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; - thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv, - &second_best_ref_mv->as_mv, + if (is_compound) { + this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int; + thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv, mvjcost, mvcost, MV_COST_WEIGHT_SUB); } break; - case NEARESTMV: - this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int; - if (has_second_rf) - this_second_mv->as_int = - frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int; - break; case NEARMV: - this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int; - if (has_second_rf) - this_second_mv->as_int = - frame_mv[NEARMV][mbmi->ref_frame[1]].as_int; + case NEARESTMV: + this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int; + if (is_compound) + this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int; break; case ZEROMV: - this_mv->as_int = 0; - if (has_second_rf) - this_second_mv->as_int = 0; + this_mv[0].as_int = 0; + if (is_compound) + this_mv[1].as_int = 0; break; default: break; } - cost = cost_mv_ref(cpi, this_mode, - mbmi->mode_context[mbmi->ref_frame[0]]); + mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int; + if (is_compound) + mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int; - mic->bmi[i].as_mv[0].as_int = this_mv->as_int; - if (has_second_rf) - mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; - - mic->bmi[i].as_mode = m; + mic->bmi[i].as_mode = mode; for (idy = 0; idy < num_4x4_blocks_high; ++idy) for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); - cost += thismvcost; - return cost; + return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) + + thismvcost; } static int64_t encode_inter_mb_segment(VP9_COMP *cpi, @@ -1543,32 +1089,38 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, int *labelyrate, int64_t *distortion, int64_t *sse, ENTROPY_CONTEXT *ta, - ENTROPY_CONTEXT *tl) { + ENTROPY_CONTEXT *tl, + int mi_row, int mi_col) { int k; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; - MODE_INFO *const mi = xd->mi_8x8[0]; - const BLOCK_SIZE bsize = mi->mbmi.sb_type; - const int width = plane_block_width(bsize, pd); - const int height = plane_block_height(bsize, pd); + MODE_INFO *const mi = xd->mi[0]; + const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd); + const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize]; int idx, idy; - uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i, - p->src.buf, p->src.stride); - uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i, - pd->dst.buf, pd->dst.stride); + const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i, + p->src.stride)]; + uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i, + pd->dst.stride)]; int64_t thisdistortion = 0, thissse 
= 0; int thisrate = 0, ref; + const scan_order *so = &vp9_default_scan_orders[TX_4X4]; const int is_compound = has_second_ref(&mi->mbmi); + const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter); + for (ref = 0; ref < 1 + is_compound; ++ref) { - const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i, - pd->pre[ref].buf, pd->pre[ref].stride); + const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i, + pd->pre[ref].stride)]; vp9_build_inter_predictor(pre, pd->pre[ref].stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv, - &xd->scale_factor[ref], - width, height, ref, &xd->subpix, MV_PRECISION_Q3); + &xd->block_refs[ref]->sf, width, height, ref, + kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE + 4 * (i % 2), + mi_row * MI_SIZE + 4 * (i / 2)); } vp9_subtract_block(height, width, @@ -1586,16 +1138,13 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, coeff = BLOCK_OFFSET(p->coeff, k); x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); - vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT), - get_iscan_4x4(DCT_DCT)); + vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); thissse += ssz; - thisrate += cost_coeffs(x, 0, k, - ta + (k & 1), - tl + (k >> 1), TX_4X4, - vp9_default_scan_4x4, - vp9_default_scan_4x4_neighbors); + thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4, + so->scan, so->neighbors, + cpi->sf.use_fast_coef_costing); rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2); rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2); rd = MIN(rd1, rd2); @@ -1624,7 +1173,7 @@ typedef struct { } SEG_RDSTAT; typedef struct { - int_mv *ref_mv, *second_ref_mv; + int_mv *ref_mv[2]; int_mv mvp; int64_t segment_rd; @@ -1632,55 +1181,108 @@ typedef struct { int64_t d; int64_t sse; int segment_yrate; - MB_PREDICTION_MODE modes[4]; + PREDICTION_MODE modes[4]; SEG_RDSTAT rdstat[4][INTER_MODES]; int mvthresh; } BEST_SEG_INFO; -static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { - int r = 0; - r |= (mv->as_mv.row >> 3) < x->mv_row_min; - r |= (mv->as_mv.row >> 3) > x->mv_row_max; - r |= (mv->as_mv.col >> 3) < x->mv_col_min; - r |= (mv->as_mv.col >> 3) > x->mv_col_max; - return r; +static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) { + return (mv->row >> 3) < x->mv_row_min || + (mv->row >> 3) > x->mv_row_max || + (mv->col >> 3) < x->mv_col_min || + (mv->col >> 3) > x->mv_col_max; } static INLINE void mi_buf_shift(MACROBLOCK *x, int i) { - MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi; + MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; - p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf, - p->src.stride); + p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)]; assert(((intptr_t)pd->pre[0].buf & 0x7) == 0); - pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf, - pd->pre[0].stride); + pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i, + pd->pre[0].stride)]; if (has_second_ref(mbmi)) - pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf, - pd->pre[1].stride); + pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i, + pd->pre[1].stride)]; } static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src, struct buf_2d orig_pre[2]) { - MB_MODE_INFO *mbmi = 
&x->e_mbd.mi_8x8[0]->mbmi; + MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi; x->plane[0].src = orig_src; x->e_mbd.plane[0].pre[0] = orig_pre[0]; if (has_second_ref(mbmi)) x->e_mbd.plane[0].pre[1] = orig_pre[1]; } -static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - BEST_SEG_INFO *bsi_buf, int filter_idx, - int_mv seg_mvs[4][MAX_REF_FRAMES], - int mi_row, int mi_col) { - int i, br = 0, idx, idy; +static INLINE int mv_has_subpel(const MV *mv) { + return (mv->row & 0x0F) || (mv->col & 0x0F); +} + +// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion. +// TODO(aconverse): Find out if this is still productive, then clean up or remove. +static int check_best_zero_mv( + const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES], + int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], + int inter_mode_mask, int this_mode, + const MV_REFERENCE_FRAME ref_frames[2]) { + if ((inter_mode_mask & (1 << ZEROMV)) && + (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && + frame_mv[this_mode][ref_frames[0]].as_int == 0 && + (ref_frames[1] == NONE || + frame_mv[this_mode][ref_frames[1]].as_int == 0)) { + int rfc = mode_context[ref_frames[0]]; + int c1 = cost_mv_ref(cpi, NEARMV, rfc); + int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); + int c3 = cost_mv_ref(cpi, ZEROMV, rfc); + + if (this_mode == NEARMV) { + if (c1 > c3) return 0; + } else if (this_mode == NEARESTMV) { + if (c2 > c3) return 0; + } else { + assert(this_mode == ZEROMV); + if (ref_frames[1] == NONE) { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) + return 0; + } else { + if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) || + (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 && + frame_mv[NEARMV][ref_frames[1]].as_int == 0)) + return 0; + } + } + } + return 1; +} + +static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, + const TileInfo * const tile, + int_mv *best_ref_mv, + int_mv *second_best_ref_mv, + int64_t best_rd, int *returntotrate, + int *returnyrate, + int64_t *returndistortion, + int *skippable, int64_t *psse, + int mvthresh, + int_mv seg_mvs[4][MAX_REF_FRAMES], + BEST_SEG_INFO *bsi_buf, int filter_idx, + int mi_row, int mi_col) { + int i; + BEST_SEG_INFO *bsi = bsi_buf + filter_idx; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + MB_MODE_INFO *mbmi = &mi->mbmi; + int mode_idx; + int k, br = 0, idx, idy; int64_t bd = 0, block_sse = 0; - MB_PREDICTION_MODE this_mode; - MODE_INFO *mi = x->e_mbd.mi_8x8[0]; - MB_MODE_INFO *const mbmi = &mi->mbmi; - struct macroblockd_plane *const pd = &x->e_mbd.plane[0]; + PREDICTION_MODE this_mode; + VP9_COMMON *cm = &cpi->common; + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; const int label_count = 4; int64_t this_segment_rd = 0; int label_mv_thresh; @@ -1688,18 +1290,25 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - vp9_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT t_above[2], t_left[2]; - BEST_SEG_INFO *bsi = bsi_buf + filter_idx; - int mode_idx; int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); + const int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; + + 
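// The hunks above fold the old mode_mv[] / second_mode_mv[] pair into a
// single mode_mv[mode][ref] table so single- and compound-prediction blocks
// share one loop over references. A minimal sketch of the access pattern
// (illustrative only, not part of the upstream change):
//   int ref;
//   for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
//     mi->bmi[i].as_mv[ref].as_int = mode_mv[this_mode][ref].as_int;
// set_and_cost_bmi_mvs() and the rdstat bookkeeping below both follow this
// shape.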
vp9_zero(*bsi); + + bsi->segment_rd = best_rd; + bsi->ref_mv[0] = best_ref_mv; + bsi->ref_mv[1] = second_best_ref_mv; + bsi->mvp.as_int = best_ref_mv->as_int; + bsi->mvthresh = mvthresh; + + for (i = 0; i < 4; i++) + bsi->modes[i] = ZEROMV; vpx_memcpy(t_above, pd->above_context, sizeof(t_above)); vpx_memcpy(t_left, pd->left_context, sizeof(t_left)); - v_fn_ptr = &cpi->fn_ptr[bsize]; - // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on // segments. setting this to 1 would make mv thresh @@ -1711,24 +1320,21 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { // TODO(jingning,rbultje): rewrite the rate-distortion optimization // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop - int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT]; + int_mv mode_mv[MB_MODE_COUNT][2]; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - MB_PREDICTION_MODE mode_selected = ZEROMV; + PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - i = idy * 2 + idx; + const int i = idy * 2 + idx; + int ref; - frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, - &frame_mv[NEARESTMV][mbmi->ref_frame[0]], - &frame_mv[NEARMV][mbmi->ref_frame[0]], - i, 0, mi_row, mi_col); - if (has_second_rf) { - frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile, - &frame_mv[NEARESTMV][mbmi->ref_frame[1]], - &frame_mv[NEARMV][mbmi->ref_frame[1]], - i, 1, mi_row, mi_col); + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + frame_mv[ZEROMV][frame].as_int = 0; + vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col, + &frame_mv[NEARESTMV][frame], + &frame_mv[NEARMV][frame]); } + // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { const struct buf_2d orig_src = x->plane[0].src; @@ -1736,43 +1342,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, mode_idx = INTER_OFFSET(this_mode); bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + if (!(inter_mode_mask & (1 << this_mode))) + continue; - // if we're near/nearest and mv == 0,0, compare to zeromv - if ((this_mode == NEARMV || this_mode == NEARESTMV || - this_mode == ZEROMV) && - frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 && - (!has_second_rf || - frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) { - int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; - int c1 = cost_mv_ref(cpi, NEARMV, rfc); - int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); - int c3 = cost_mv_ref(cpi, ZEROMV, rfc); - - if (this_mode == NEARMV) { - if (c1 > c3) - continue; - } else if (this_mode == NEARESTMV) { - if (c2 > c3) - continue; - } else { - assert(this_mode == ZEROMV); - if (!has_second_rf) { - if ((c3 >= c2 && - frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || - (c3 >= c1 && - frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) - continue; - } else { - if ((c3 >= c2 && - frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && - frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || - (c3 >= c1 && - frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && - frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) - continue; - } - } - } + if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, + inter_mode_mask, + this_mode, mbmi->ref_frame)) + continue; vpx_memcpy(orig_pre, pd->pre, 
sizeof(orig_pre)); vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above, @@ -1783,19 +1359,20 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { + MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; - int further_steps; int thissme, bestsme = INT_MAX; int sadpb = x->sadperbit4; - int_mv mvp_full; + MV mvp_full; int max_mv; + int sad_list[5]; /* Is the best so far sufficiently good that we can't justify doing * a new motion search. */ if (best_rd < label_mv_thresh) break; - if (cpi->compressor_speed) { + if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. if (i > 0) { bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; @@ -1808,91 +1385,79 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, else max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3; - if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { // Take wtd average of the step_params based on the last frame's // max mv magnitude and the best ref mvs of the current block for // the given reference. - step_param = (vp9_init_search_range(cpi, max_mv) + - cpi->mv_step_param) >> 1; + step_param = (vp9_init_search_range(max_mv) + + cpi->mv_step_param) / 2; } else { step_param = cpi->mv_step_param; } - mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3; - mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3; + mvp_full.row = bsi->mvp.as_mv.row >> 3; + mvp_full.col = bsi->mvp.as_mv.col >> 3; - if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) { - mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3; - mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3; + if (cpi->sf.adaptive_motion_search) { + mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].col >> 3; step_param = MAX(step_param, 8); } - further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; // adjust src pointer for this block mi_buf_shift(x, i); - if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full.as_mv, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv->as_mv, - &mode_mv[NEWMV].as_mv); - } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full.as_mv, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv->as_mv, - &mode_mv[NEWMV].as_mv); - } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv->as_mv, - &mode_mv[NEWMV].as_mv); - } else { - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 0, v_fn_ptr, - bsi->ref_mv, &mode_mv[NEWMV]); - } + + vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv); + + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? sad_list : NULL, + &bsi->ref_mv[0]->as_mv, new_mv, + INT_MAX, 1); // Should we do a full search (best quality only) - if (cpi->compressor_speed == 0) { + if (cpi->oxcf.mode == BEST) { + int_mv *const best_mv = &mi->bmi[i].as_mv[0]; /* Check if mvp_full is within the range. 
*/ - clamp_mv(&mvp_full.as_mv, x->mv_col_min, x->mv_col_max, + clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); - thissme = cpi->full_search_sad(x, &mvp_full, - sadpb, 16, v_fn_ptr, - x->nmvjointcost, x->mvcost, - bsi->ref_mv, i); - + sadpb, 16, &cpi->fn_ptr[bsize], + &bsi->ref_mv[0]->as_mv, + &best_mv->as_mv); + sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX; if (thissme < bestsme) { bestsme = thissme; - mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int; + *new_mv = best_mv->as_mv; } else { - /* The full search result is actually worse so re-instate the - * previous best vector */ - mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int; + // The full search result is actually worse so re-instate the + // previous best vector + best_mv->as_mv = *new_mv; } } if (bestsme < INT_MAX) { int distortion; - unsigned int sse; - cpi->find_fractional_mv_step(x, - &mode_mv[NEWMV].as_mv, - &bsi->ref_mv->as_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, v_fn_ptr, - 0, cpi->sf.subpel_iters_per_step, - x->nmvjointcost, x->mvcost, - &distortion, &sse); + cpi->find_fractional_mv_step( + x, + new_mv, + &bsi->ref_mv[0]->as_mv, + cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_sad_list(cpi, sad_list), + x->nmvjointcost, x->mvcost, + &distortion, + &x->pred_sse[mbmi->ref_frame[0]], + NULL, 0, 0); // save motion search result for use in compound prediction - seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int; + seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv; } if (cpi->sf.adaptive_motion_search) - x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int; + x->pred_mv[mbmi->ref_frame[0]] = *new_mv; // restore src pointers mi_buf_restore(x, orig_src, orig_pre); @@ -1923,58 +1488,44 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, } bsi->rdstat[i][mode_idx].brate = - labels2mode(x, i, this_mode, &mode_mv[this_mode], - &second_mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, - x->mvcost, cpi); + set_and_cost_bmi_mvs(cpi, xd, i, this_mode, mode_mv[this_mode], + frame_mv, seg_mvs[i], bsi->ref_mv, + x->nmvjointcost, x->mvcost); - - bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; - if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[0].as_int = - mode_mv[this_mode].as_int; - if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[0].as_int = - mode_mv[this_mode].as_int; - if (has_second_rf) { - bsi->rdstat[i][mode_idx].mvs[1].as_int = - second_mode_mv[this_mode].as_int; + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + bsi->rdstat[i][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[1].as_int = - second_mode_mv[this_mode].as_int; + bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[1].as_int = - second_mode_mv[this_mode].as_int; + bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int = + mode_mv[this_mode][ref].as_int; } // Trap vectors that reach beyond the UMV borders - if (mv_check_bounds(x, &mode_mv[this_mode])) - continue; - if (has_second_rf && - mv_check_bounds(x, &second_mode_mv[this_mode])) + if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) || + (has_second_rf && + mv_check_bounds(x, &mode_mv[this_mode][1].as_mv))) continue; if (filter_idx > 0) { BEST_SEG_INFO *ref_bsi = 
bsi_buf; - subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) || - (mode_mv[this_mode].as_mv.col & 0x0f); - have_ref = mode_mv[this_mode].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; - if (has_second_rf) { - subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) || - (second_mode_mv[this_mode].as_mv.col & 0x0f); - have_ref &= second_mode_mv[this_mode].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; + subpelmv = 0; + have_ref = 1; + + for (ref = 0; ref < 1 + has_second_rf; ++ref) { + subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); + have_ref &= mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; } if (filter_idx > 1 && !subpelmv && !have_ref) { ref_bsi = bsi_buf + 1; - have_ref = mode_mv[this_mode].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[0].as_int; - if (has_second_rf) { - have_ref &= second_mode_mv[this_mode].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[1].as_int; - } + have_ref = 1; + for (ref = 0; ref < 1 + has_second_rf; ++ref) + have_ref &= mode_mv[this_mode][ref].as_int == + ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; } if (!subpelmv && have_ref && @@ -2003,16 +1554,17 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, &bsi->rdstat[i][mode_idx].bdist, &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta, - bsi->rdstat[i][mode_idx].tl); + bsi->rdstat[i][mode_idx].tl, + mi_row, mi_col); if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = pd->eobs[i]; + bsi->rdstat[i][mode_idx].eobs = p->eobs[i]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1]; + bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1]; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2]; + bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2]; } if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { @@ -2027,17 +1579,16 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; - return; + return INT64_MAX; } mode_idx = INTER_OFFSET(mode_selected); vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); - labels2mode(x, i, mode_selected, &mode_mv[mode_selected], - &second_mode_mv[mode_selected], frame_mv, seg_mvs[i], - bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost, - x->mvcost, cpi); + set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected], + frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, + x->mvcost); br += bsi->rdstat[i][mode_idx].brate; bd += bsi->rdstat[i][mode_idx].bdist; @@ -2051,7 +1602,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; - return; + return INT64_MAX; } } } /* for each label */ @@ -2063,44 +1614,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, bsi->sse = block_sse; // update the coding decisions - for (i = 0; i < 4; ++i) - bsi->modes[i] = mi->bmi[i].as_mode; -} - -static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, - int_mv *best_ref_mv, - int_mv *second_best_ref_mv, - int64_t best_rd, - int *returntotrate, - int *returnyrate, - 
int64_t *returndistortion, - int *skippable, int64_t *psse, - int mvthresh, - int_mv seg_mvs[4][MAX_REF_FRAMES], - BEST_SEG_INFO *bsi_buf, - int filter_idx, - int mi_row, int mi_col) { - int i; - BEST_SEG_INFO *bsi = bsi_buf + filter_idx; - MACROBLOCKD *xd = &x->e_mbd; - MODE_INFO *mi = xd->mi_8x8[0]; - MB_MODE_INFO *mbmi = &mi->mbmi; - int mode_idx; - - vp9_zero(*bsi); - - bsi->segment_rd = best_rd; - bsi->ref_mv = best_ref_mv; - bsi->second_ref_mv = second_best_ref_mv; - bsi->mvp.as_int = best_ref_mv->as_int; - bsi->mvthresh = mvthresh; - - for (i = 0; i < 4; i++) - bsi->modes[i] = ZEROMV; - - rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs, - mi_row, mi_col); + for (k = 0; k < 4; ++k) + bsi->modes[k] = mi->bmi[k].as_mode; if (bsi->segment_rd > best_rd) return INT64_MAX; @@ -2110,7 +1625,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int; if (has_second_ref(mbmi)) mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int; - xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs; + x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs; mi->bmi[i].as_mode = bsi->modes[i]; } @@ -2120,73 +1635,19 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, *returntotrate = bsi->r; *returndistortion = bsi->d; *returnyrate = bsi->segment_yrate; - *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0); + *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0); *psse = bsi->sse; mbmi->mode = bsi->modes[3]; return bsi->segment_rd; } -static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, - uint8_t *ref_y_buffer, int ref_y_stride, - int ref_frame, BLOCK_SIZE block_size ) { - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - int_mv this_mv; - int i; - int zero_seen = 0; - int best_index = 0; - int best_sad = INT_MAX; - int this_sad = INT_MAX; - unsigned int max_mv = 0; - - uint8_t *src_y_ptr = x->plane[0].src.buf; - uint8_t *ref_y_ptr; - int row_offset, col_offset; - int num_mv_refs = MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && - cpi->common.show_frame && - block_size < cpi->sf.max_partition_size); - - // Get the sad for each candidate reference mv - for (i = 0; i < num_mv_refs; i++) { - this_mv.as_int = (i < MAX_MV_REF_CANDIDATES) ? - mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int; - - max_mv = MAX(max_mv, - MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); - // The list is at an end if we see 0 for a second time. - if (!this_mv.as_int && zero_seen) - break; - zero_seen = zero_seen || !this_mv.as_int; - - row_offset = this_mv.as_mv.row >> 3; - col_offset = this_mv.as_mv.col >> 3; - ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset; - - // Find sad for current vector. - this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride, - ref_y_ptr, ref_y_stride, - 0x7fffffff); - - // Note if it is the best so far. - if (this_sad < best_sad) { - best_sad = this_sad; - best_index = i; - } - } - - // Note the index of the mv that worked best in the reference list. 
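// This mv_pred() helper, deleted here, survives in the update as
// vp9_mv_pred(), called from setup_buffer_inter() below; the selection idea
// is unchanged: take the SAD of each candidate reference MV against the
// source block and keep the cheapest one as the centre for the later search.
// A condensed sketch (hypothetical sad_fn standing in for
// cpi->fn_ptr[block_size].sdf; needs <limits.h> for UINT_MAX):
//   static int best_ref_mv_index(const MV *cand, int n,
//                                const uint8_t *src, int src_stride,
//                                const uint8_t *ref, int ref_stride,
//                                unsigned (*sad_fn)(const uint8_t *, int,
//                                                   const uint8_t *, int)) {
//     int i, best = 0;
//     unsigned best_sad = UINT_MAX;
//     for (i = 0; i < n; ++i) {
//       // MVs are stored in 1/8-pel units, so >> 3 gives full-pel offsets.
//       const uint8_t *p = ref + (cand[i].row >> 3) * ref_stride +
//                          (cand[i].col >> 3);
//       const unsigned sad = sad_fn(src, src_stride, p, ref_stride);
//       if (sad < best_sad) {
//         best_sad = sad;
//         best = i;
//       }
//     }
//     return best;
//   }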
- x->mv_best_ref_index[ref_frame] = best_index; - x->max_mv_context[ref_frame] = max_mv; -} - -static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, +static void estimate_ref_frame_costs(const VP9_COMMON *cm, + const MACROBLOCKD *xd, + int segment_id, unsigned int *ref_costs_single, unsigned int *ref_costs_comp, vp9_prob *comp_mode_p) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); if (seg_ref_active) { @@ -2194,11 +1655,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp)); *comp_mode_p = 128; } else { - vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd); + vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd); vp9_prob comp_inter_p = 128; - if (cm->comp_pred_mode == HYBRID_PREDICTION) { - comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd); + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + comp_inter_p = vp9_get_reference_mode_prob(cm, xd); *comp_mode_p = comp_inter_p; } else { *comp_mode_p = 128; @@ -2206,12 +1667,12 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0); - if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) { + if (cm->reference_mode != COMPOUND_REFERENCE) { vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd); vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd); unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); - if (cm->comp_pred_mode == HYBRID_PREDICTION) + if (cm->reference_mode == REFERENCE_MODE_SELECT) base_cost += vp9_cost_bit(comp_inter_p, 0); ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] = @@ -2226,11 +1687,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, ref_costs_single[GOLDEN_FRAME] = 512; ref_costs_single[ALTREF_FRAME] = 512; } - if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) { + if (cm->reference_mode != SINGLE_REFERENCE) { vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd); unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1); - if (cm->comp_pred_mode == HYBRID_PREDICTION) + if (cm->reference_mode == REFERENCE_MODE_SELECT) base_cost += vp9_cost_bit(comp_inter_p, 1); ref_costs_comp[LAST_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 0); @@ -2244,142 +1705,90 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, - int_mv *ref_mv, - int_mv *second_ref_mv, - int64_t comp_pred_diff[NB_PREDICTION_TYPES], - int64_t tx_size_diff[TX_MODES], - int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) { + int64_t comp_pred_diff[REFERENCE_MODES], + const int64_t tx_size_diff[TX_MODES], + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS], + int skippable) { MACROBLOCKD *const xd = &x->e_mbd; // Take a snapshot of the coding context so it can be // restored if we decide to encode this way ctx->skip = x->skip; + ctx->skippable = skippable; ctx->best_mode_index = mode_index; - ctx->mic = *xd->mi_8x8[0]; - - ctx->best_ref_mv.as_int = ref_mv->as_int; - ctx->second_best_ref_mv.as_int = second_ref_mv->as_int; - - ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY]; - ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; - ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; + ctx->mic = *xd->mi[0]; + ctx->single_pred_diff = 
(int)comp_pred_diff[SINGLE_REFERENCE]; + ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE]; + ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); vpx_memcpy(ctx->best_filter_diff, best_filter_diff, sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS); } -static void setup_pred_block(const MACROBLOCKD *xd, - struct buf_2d dst[MAX_MB_PLANE], - const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col, - const struct scale_factors *scale, - const struct scale_factors *scale_uv) { - int i; - - dst[0].buf = src->y_buffer; - dst[0].stride = src->y_stride; - dst[1].buf = src->u_buffer; - dst[2].buf = src->v_buffer; - dst[1].stride = dst[2].stride = src->uv_stride; -#if CONFIG_ALPHA - dst[3].buf = src->alpha_buffer; - dst[3].stride = src->alpha_stride; -#endif - - // TODO(jkoleszar): Make scale factors per-plane data - for (i = 0; i < MAX_MB_PLANE; i++) { - setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, - i ? scale_uv : scale, - xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); - } -} - static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, - int idx, MV_REFERENCE_FRAME frame_type, + MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], - struct buf_2d yv12_mb[4][MAX_MB_PLANE], - struct scale_factors scale[MAX_REF_FRAMES]) { - VP9_COMMON *cm = &cpi->common; - YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; + struct buf_2d yv12_mb[4][MAX_MB_PLANE]) { + const VP9_COMMON *cm = &cpi->common; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - - // set up scaling factors - scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1]; - - scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type], - mi_row * MI_SIZE, mi_col * MI_SIZE); + MODE_INFO *const mi = xd->mi[0]; + int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. - setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col, - &scale[frame_type], &scale[frame_type]); + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0], - xd->last_mi, - frame_type, - mbmi->ref_mvs[frame_type], mi_row, mi_col); + vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col); // Candidate refinement carried out at encoder and decoder - vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, - mbmi->ref_mvs[frame_type], - &frame_nearest_mv[frame_type], - &frame_near_mv[frame_type]); + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &frame_nearest_mv[ref_frame], + &frame_near_mv[ref_frame]); // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. 
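// estimate_ref_frame_costs() above prices each reference-frame decision with
// vp9_cost_bit(p, b): the entropy cost of coding bit b under an 8-bit
// probability p, roughly -log2(p/256) bits when b == 0 and -log2(1 - p/256)
// bits when b == 1. A floating-point sketch of the idea (the real
// implementation reads a precomputed integer table):
//   #include <math.h>
//   static double bit_cost_in_bits(int prob_of_zero, int bit) {
//     const double p0 = prob_of_zero / 256.0;
//     return -log2(bit ? 1.0 - p0 : p0);
//   }
// With p == 128 either bit value costs exactly one bit, which is why the
// neutral probability 128 is used wherever a decision is not signalled.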
- if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8) - mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, - frame_type, block_size); -} - -static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { - YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; - int fb = get_ref_frame_idx(cpi, ref_frame); - int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame); - if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb]) - scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]]; - return scaled_ref_frame; -} - -static INLINE int get_switchable_rate(const MACROBLOCK *x) { - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const int ctx = vp9_get_pred_context_switchable_interp(xd); - return SWITCHABLE_INTERP_RATE_FACTOR * - x->switchable_interp_costs[ctx][mbmi->interp_filter]; + if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8) + vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, + ref_frame, block_size); } static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *cm = &cpi->common; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + const VP9_COMMON *cm = &cpi->common; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}}; int bestsme = INT_MAX; - int further_steps, step_param; + int step_param; int sadpb = x->sadperbit16; - int_mv mvp_full; + MV mvp_full; int ref = mbmi->ref_frame[0]; - int_mv ref_mv = mbmi->ref_mvs[ref][0]; - const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); + MV ref_mv = mbmi->ref_mvs[ref][0].as_mv; int tmp_col_min = x->mv_col_min; int tmp_col_max = x->mv_col_max; int tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; + int sad_list[5]; - YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref); + const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, + ref); + + MV pred_mv[3]; + pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv; + pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv; + pred_mv[2] = x->pred_mv[ref]; if (scaled_ref_frame) { int i; @@ -2389,84 +1798,63 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); + vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL); } - vp9_clamp_mv_min_max(x, &ref_mv.as_mv); + vp9_set_mv_search_range(x, &ref_mv); - // Adjust search parameters based on small partitions' result. - if (x->fast_ms) { - // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 && - // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) { - // adjust search range - step_param = 6; - if (x->fast_ms > 1) - step_param = 8; - - // Get prediction MV. - mvp_full.as_int = x->pred_mv[ref].as_int; - - // Adjust MV sign if needed. - if (cm->ref_frame_sign_bias[ref]) { - mvp_full.as_mv.col *= -1; - mvp_full.as_mv.row *= -1; - } + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is MAX >> 1 etc. + if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. 
+ step_param = (vp9_init_search_range(x->max_mv_context[ref]) + + cpi->mv_step_param) / 2; } else { - // Work out the size of the first step in the mv step search. - // 0 here is maximum length first step. 1 is MAX >> 1 etc. - if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { - // Take wtd average of the step_params based on the last frame's - // max mv magnitude and that based on the best ref mvs of the current - // block for the given reference. - step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + - cpi->mv_step_param) >> 1; - } else { - step_param = cpi->mv_step_param; - } + step_param = cpi->mv_step_param; } - if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 && - cpi->common.show_frame) { + if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize), b_width_log2(bsize))); step_param = MAX(step_param, boffset); } - mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ? - mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int : - x->pred_mv[ref].as_int; + if (cpi->sf.adaptive_motion_search) { + int bwl = b_width_log2(bsize); + int bhl = b_height_log2(bsize); + int i; + int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); - mvp_full.as_mv.col >>= 3; - mvp_full.as_mv.row >>= 3; + if (tlevel < 5) + step_param += 2; - // Further step/diamond searches as necessary - further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + x->pred_mv[ref].row = 0; + x->pred_mv[ref].col = 0; + tmp_mv->as_int = INVALID_MV; - if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full.as_mv, - step_param, - sadpb, 1, - &cpi->fn_ptr[block_size], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full.as_mv, - step_param, - sadpb, 1, - &cpi->fn_ptr[block_size], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full.as_mv, - step_param, - sadpb, 1, - &cpi->fn_ptr[block_size], 1, - &ref_mv.as_mv, &tmp_mv->as_mv); - } else { - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[block_size], - &ref_mv, tmp_mv); + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + return; + } + } } + mvp_full = pred_mv[x->mv_best_ref_index[ref]]; + + mvp_full.col >>= 3; + mvp_full.row >>= 3; + + bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + cond_sad_list(cpi, sad_list), + &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); + x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; x->mv_row_min = tmp_row_min; @@ -2474,20 +1862,21 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. 
*/ - unsigned int sse; - cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv, + cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[block_size], - 0, cpi->sf.subpel_iters_per_step, + &cpi->fn_ptr[bsize], + cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, + cond_sad_list(cpi, sad_list), x->nmvjointcost, x->mvcost, - &dis, &sse); + &dis, &x->pred_sse[ref], NULL, 0, 0); } - *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv, + *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) - x->pred_mv[ref].as_int = tmp_mv->as_int; + if (cpi->sf.adaptive_motion_search) + x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { int i; @@ -2502,24 +1891,25 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], int *rate_mv) { - int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize); + const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; + const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] }; int_mv ref_mv[2]; - const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); int ite, ref; // Prediction buffer from second frame. uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t)); + const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter); // Do joint motion search in compound mode to get more accurate mv. struct buf_2d backup_yv12[2][MAX_MB_PLANE]; struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0]; int last_besterr[2] = {INT_MAX, INT_MAX}; - YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { - get_scaled_ref_frame(cpi, mbmi->ref_frame[0]), - get_scaled_ref_frame(cpi, mbmi->ref_frame[1]) + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]), + vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1]) }; for (ref = 0; ref < 2; ++ref) { @@ -2532,11 +1922,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // motion search code to be used without additional modifications. for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[ref][i] = xd->plane[i].pre[ref]; - setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL); + vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL); } - xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref], - mi_row, mi_col); frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; } @@ -2546,7 +1935,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, struct buf_2d ref_yv12[2]; int bestsme = INT_MAX; int sadpb = x->sadperbit16; - int_mv tmp_mv; + MV tmp_mv; int search_range = 3; int tmp_col_min = x->mv_col_min; @@ -2564,28 +1953,30 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, ref_yv12[!id].stride, second_pred, pw, &frame_mv[refs[!id]].as_mv, - &xd->scale_factor[!id], + &xd->block_refs[!id]->sf, pw, ph, 0, - &xd->subpix, MV_PRECISION_Q3); + kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); // Compound motion search on first ref frame. 
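Editor's sketch: the joint_motion_search() hunk replaces the log2-based pw/ph computation with the num_4x4_blocks_{wide,high}_lookup tables and allocates a second prediction buffer of that size. A small standalone version of the sizing follows; the enum and table values are assumptions matching VP9 block geometry, not copies of the library tables.

#include <stdio.h>
#include <stdlib.h>

/* e.g. BLOCK_16X8 spans 4x2 blocks of 4x4 pixels (assumed values) */
enum { BLOCK_8X8, BLOCK_16X8, BLOCK_16X16, BLOCK_SIZES };
static const int num_4x4_blocks_wide[BLOCK_SIZES] = { 2, 4, 4 };
static const int num_4x4_blocks_high[BLOCK_SIZES] = { 2, 2, 4 };

int main(void) {
  const int bsize = BLOCK_16X8;
  const int pw = 4 * num_4x4_blocks_wide[bsize];  /* pixels wide */
  const int ph = 4 * num_4x4_blocks_high[bsize];  /* pixels high */
  /* one byte per pixel for the averaged second predictor */
  unsigned char *second_pred = (unsigned char *)malloc((size_t)pw * ph);
  if (second_pred == NULL)
    return 1;
  printf("%dx%d block -> %d-byte compound prediction buffer\n",
         pw, ph, pw * ph);
  free(second_pred);
  return 0;
}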
if (id) xd->plane[0].pre[0] = ref_yv12[id]; - vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv); + vp9_set_mv_search_range(x, &ref_mv[id].as_mv); // Use mv result from single mode as mvp. - tmp_mv.as_int = frame_mv[refs[id]].as_int; + tmp_mv = frame_mv[refs[id]].as_mv; - tmp_mv.as_mv.col >>= 3; - tmp_mv.as_mv.row >>= 3; + tmp_mv.col >>= 3; + tmp_mv.row >>= 3; // Small-range full-pixel motion search bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, search_range, - &cpi->fn_ptr[block_size], - x->nmvjointcost, x->mvcost, - &ref_mv[id], second_pred, - pw, ph); + &cpi->fn_ptr[bsize], + &ref_mv[id].as_mv, second_pred); + if (bestsme < INT_MAX) + bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv, + second_pred, &cpi->fn_ptr[bsize], 1); x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; @@ -2595,14 +1986,14 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int dis; /* TODO: use dis in distortion calculation later. */ unsigned int sse; - - bestsme = cpi->find_fractional_mv_step_comp( - x, &tmp_mv.as_mv, + bestsme = cpi->find_fractional_mv_step( + x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[block_size], - 0, cpi->sf.subpel_iters_per_step, + &cpi->fn_ptr[bsize], + 0, cpi->sf.mv.subpel_iters_per_step, + NULL, x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, pw, ph); @@ -2612,7 +2003,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[0].pre[0] = scaled_first_yv12; if (bestsme < last_besterr[id]) { - frame_mv[refs[id]].as_int = tmp_mv.as_int; + frame_mv[refs[id]].as_mv = tmp_mv; last_besterr[id] = bestsme; } else { break; @@ -2637,26 +2028,116 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, vpx_free(second_pred); } +static INLINE void restore_dst_buf(MACROBLOCKD *xd, + uint8_t *orig_dst[MAX_MB_PLANE], + int orig_dst_stride[MAX_MB_PLANE]) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].dst.buf = orig_dst[i]; + xd->plane[i].dst.stride = orig_dst_stride[i]; + } +} + +static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int *rate2, + int64_t *distortion, int64_t *distortion_uv, + int *disable_skip) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var, sse; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc + unsigned int thresh_dc; + + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride, &sse); + + if (x->encode_breakout > 0) { + // Set a maximum for threshold to avoid big PSNR loss in low bitrate + // case. Use extreme low threshold for static frames to limit skipping. + const unsigned int max_thresh = (cpi->allow_encode_breakout == + ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust threshold according to partition size. 
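Editor's sketch: the rd_encode_breakout_test() factored out above derives its skip thresholds from the dequant step sizes and the user's encode_breakout setting. The same arithmetic, rerun standalone with invented inputs (every constant here is an assumption, not a libvpx default):

#include <stdio.h>

static unsigned clampu(unsigned v, unsigned lo, unsigned hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

int main(void) {
  const unsigned encode_breakout = 800;  /* user knob, invented value */
  const unsigned max_thresh = 36000;     /* cap for non-static frames */
  const unsigned min_thresh = clampu(encode_breakout << 4, 0, max_thresh);
  const unsigned dq_ac = 48, dq_dc = 40; /* example dequant step sizes */
  const int bwl = 2, bhl = 2;            /* 16x16 in 4-pel log2 units */
  unsigned thresh_ac, thresh_dc, sse = 700, var = 680;
  int y_skip;

  thresh_ac = (dq_ac * dq_ac) / 9;              /* from AC quantizer */
  thresh_ac = clampu(thresh_ac, min_thresh, max_thresh);
  thresh_ac >>= 8 - (bwl + bhl);                /* scale with block size */
  thresh_dc = (dq_dc * dq_dc) >> 6;             /* from DC quantizer */

  y_skip = (sse < thresh_ac || sse == 0) &&
           ((sse - var) < thresh_dc || sse == var);
  printf("thresh_ac=%u thresh_dc=%u y_skip=%d\n", thresh_ac, thresh_dc,
         y_skip);
  return 0;
}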
+ thresh_ac >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking + if (sse < thresh_ac || sse == 0) { + // dc skipping checking + if ((sse - var) < thresh_dc || sse == var) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((sse_u * 4 < thresh_ac || sse_u == 0) && + (sse_u - var_u < thresh_dc || sse_u == var_u)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((sse_v * 4 < thresh_ac || sse_v == 0) && + (sse_v - var_v < thresh_dc || sse_v == var_v)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency domain + // is 16. Adjust distortion accordingly. + *distortion_uv = (sse_u + sse_v) << 4; + *distortion = (sse << 4) + *distortion_uv; + + *disable_skip = 1; + } + } + } + } +} + static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, BLOCK_SIZE bsize, int64_t txfm_cache[], int *rate2, int64_t *distortion, int *skippable, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, - int *mode_excluded, int *disable_skip, - INTERPOLATION_TYPE *best_filter, + int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], + INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], + int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, const int64_t ref_best_rd) { VP9_COMMON *cm = &cpi->common; + RD_OPT *rd_opt = &cpi->rd; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_comp_pred = has_second_ref(mbmi); - const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; int i; @@ -2667,16 +2148,41 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64); int pred_exists = 0; int intpel_mv; - int64_t rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + INTERP_FILTER best_filter = SWITCHABLE; + uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0}; + int64_t bsse[MAX_MB_PLANE << 2] = {0}; + + int bsl = mi_width_log2_lookup[bsize]; + int pred_filter_search = cpi->sf.cb_pred_filter_search ? 
+ (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; + + if (pred_filter_search) { + INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE; + if (xd->up_available) + af = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + if (xd->left_available) + lf = xd->mi[-1]->mbmi.interp_filter; + + if ((this_mode != NEWMV) || (af == lf)) + best_filter = af; + } if (is_comp_pred) { if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) return INT64_MAX; + + if (cpi->sf.adaptive_mode_search) { + if (single_filter[this_mode][refs[0]] == + single_filter[this_mode][refs[1]]) + best_filter = single_filter[this_mode][refs[0]]; + } } if (this_mode == NEWMV) { @@ -2700,58 +2206,24 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += rate_mv; } else { int_mv tmp_mv; - single_motion_search(cpi, x, tile, bsize, mi_row, mi_col, + single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + if (tmp_mv.as_int == INVALID_MV) + return INT64_MAX; *rate2 += rate_mv; frame_mv[refs[0]].as_int = - xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int; single_newmv[refs[0]].as_int = tmp_mv.as_int; } } - // if we're near/nearest and mv == 0,0, compare to zeromv - if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && - frame_mv[refs[0]].as_int == 0 && - !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) && - (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) { - int rfc = mbmi->mode_context[mbmi->ref_frame[0]]; - int c1 = cost_mv_ref(cpi, NEARMV, rfc); - int c2 = cost_mv_ref(cpi, NEARESTMV, rfc); - int c3 = cost_mv_ref(cpi, ZEROMV, rfc); - - if (this_mode == NEARMV) { - if (c1 > c3) - return INT64_MAX; - } else if (this_mode == NEARESTMV) { - if (c2 > c3) - return INT64_MAX; - } else { - assert(this_mode == ZEROMV); - if (num_refs == 1) { - if ((c3 >= c2 && - mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) || - (c3 >= c1 && - mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0)) - return INT64_MAX; - } else { - if ((c3 >= c2 && - mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 && - mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) || - (c3 >= c1 && - mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 && - mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0)) - return INT64_MAX; - } - } - } - - for (i = 0; i < num_refs; ++i) { + for (i = 0; i < is_comp_pred + 1; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd); - if (mv_check_bounds(x, &cur_mv[i])) + if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX; mbmi->mv[i].as_int = cur_mv[i].as_int; } @@ -2770,67 +2242,63 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other * words if you present them in that order, the second one is always known * if the first is known */ - *rate2 += cost_mv_ref(cpi, this_mode, - mbmi->mode_context[mbmi->ref_frame[0]]); + *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); - if (!(*mode_excluded)) { - if (is_comp_pred) { - *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY); - } else { - *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY); - } - } + if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && + mbmi->mode != NEARESTMV) + return INT64_MAX; pred_exists = 0; // Are all MVs integer pel for Y and UV - 
intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 && - (mbmi->mv[0].as_mv.col & 15) == 0; + intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv); if (is_comp_pred) - intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 && - (mbmi->mv[1].as_mv.col & 15) == 0; + intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv); + // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - if (cm->mcomp_filter_type != BILINEAR) { - *best_filter = EIGHTTAP; - if (x->source_variance < - cpi->sf.disable_filter_search_var_thresh) { - *best_filter = EIGHTTAP; - vp9_zero(cpi->rd_filter_cache); - } else { - int i, newbest; + rd_opt->mask_filter = 0; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + rd_opt->filter_cache[i] = INT64_MAX; + + if (cm->interp_filter != BILINEAR) { + if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { + best_filter = EIGHTTAP; + } else if (best_filter == SWITCHABLE) { + int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; for (i = 0; i < SWITCHABLE_FILTERS; ++i) { int j; int64_t rs_rd; mbmi->interp_filter = i; - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - rs = get_switchable_rate(x); + rs = vp9_get_switchable_rate(cpi); rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); if (i > 0 && intpel_mv) { - cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, - tmp_rate_sum, tmp_dist_sum); - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], - cpi->rd_filter_cache[i] + rs_rd); - rd = cpi->rd_filter_cache[i]; - if (cm->mcomp_filter_type == SWITCHABLE) + rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum); + rd_opt->filter_cache[i] = rd; + rd_opt->filter_cache[SWITCHABLE_FILTERS] = + MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); + if (cm->interp_filter == SWITCHABLE) rd += rs_rd; + rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd); } else { int rate_sum = 0; int64_t dist_sum = 0; - if ((cm->mcomp_filter_type == SWITCHABLE && + if (i > 0 && cpi->sf.adaptive_interp_filter_search && + (cpi->sf.interp_filter_search_mask & (1 << i))) { + rate_sum = INT_MAX; + dist_sum = INT64_MAX; + continue; + } + + if ((cm->interp_filter == SWITCHABLE && (!i || best_needs_copy)) || - (cm->mcomp_filter_type != SWITCHABLE && - (cm->mcomp_filter_type == mbmi->interp_filter || + (cm->interp_filter != SWITCHABLE && + (cm->interp_filter == mbmi->interp_filter || (i == 0 && intpel_mv)))) { - for (j = 0; j < MAX_MB_PLANE; j++) { - xd->plane[j].dst.buf = orig_dst[j]; - xd->plane[j].dst.stride = orig_dst_stride[j]; - } + restore_dst_buf(xd, orig_dst, orig_dst_stride); } else { for (j = 0; j < MAX_MB_PLANE; j++) { xd->plane[j].dst.buf = tmp_buf + j * 64 * 64; @@ -2839,25 +2307,24 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); - cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv, - rate_sum, dist_sum); - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], - cpi->rd_filter_cache[i] + rs_rd); - rd = cpi->rd_filter_cache[i]; - if (cm->mcomp_filter_type == SWITCHABLE) + + rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); + rd_opt->filter_cache[i] = rd; + rd_opt->filter_cache[SWITCHABLE_FILTERS] = + MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd); + if (cm->interp_filter == SWITCHABLE) rd += rs_rd; + rd_opt->mask_filter = MAX(rd_opt->mask_filter, 
rd); + if (i == 0 && intpel_mv) { tmp_rate_sum = rate_sum; tmp_dist_sum = dist_sum; } } + if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { if (rd / 2 > ref_best_rd) { - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; - } + restore_dst_buf(xd, orig_dst, orig_dst_stride); return INT64_MAX; } } @@ -2865,29 +2332,27 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (newbest) { best_rd = rd; - *best_filter = mbmi->interp_filter; - if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv) + best_filter = mbmi->interp_filter; + if (cm->interp_filter == SWITCHABLE && i && !intpel_mv) best_needs_copy = !best_needs_copy; + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } - if ((cm->mcomp_filter_type == SWITCHABLE && newbest) || - (cm->mcomp_filter_type != SWITCHABLE && - cm->mcomp_filter_type == mbmi->interp_filter)) { + if ((cm->interp_filter == SWITCHABLE && newbest) || + (cm->interp_filter != SWITCHABLE && + cm->interp_filter == mbmi->interp_filter)) { pred_exists = 1; + tmp_rd = best_rd; } } - - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; - } + restore_dst_buf(xd, orig_dst, orig_dst_stride); } } // Set the appropriate filter - mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ? - cm->mcomp_filter_type : *best_filter; - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - rs = cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(x) : 0; + mbmi->interp_filter = cm->interp_filter != SWITCHABLE ? + cm->interp_filter : best_filter; + rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0; if (pred_exists) { if (best_needs_copy) { @@ -2897,131 +2362,64 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].dst.stride = 64; } } + rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0); } else { - // Handles the special case when a filter that is not in the - // switchable list (ex. bilinear, 6-tap) is indicated at the frame level - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - } - - - if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { int tmp_rate; int64_t tmp_dist; + // Handles the special case when a filter that is not in the + // switchable list (ex. bilinear) is indicated at the frame level, or + // skip condition holds. 
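Editor's sketch: both filter-search paths above bail out early when even half of the modeled RD cost already exceeds ref_best_rd, restoring the destination buffers before returning INT64_MAX. A toy version of that pruning rule; model_rd() is a dummy stand-in for model_rd_for_sb(), with an invented lambda.

#include <stdint.h>
#include <stdio.h>

static int64_t model_rd(int rate, int64_t dist) {
  return dist + 4 * rate;  /* toy Lagrangian, lambda = 4 (assumed) */
}

static int64_t try_mode(int rate, int64_t dist, int64_t ref_best_rd) {
  int64_t rd = model_rd(rate, dist);
  if (rd / 2 > ref_best_rd) {
    /* restore_dst_buf(...) would run here in the real code */
    return INT64_MAX;      /* prune: this mode cannot win */
  }
  return rd;
}

int main(void) {
  printf("%lld\n", (long long)try_mode(1000, 90000, 20000)); /* pruned */
  printf("%lld\n", (long long)try_mode(1000, 30000, 20000)); /* kept */
  return 0;
}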
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); + } + + if (!is_comp_pred) + single_filter[this_mode][refs[0]] = mbmi->interp_filter; + + if (cpi->sf.adaptive_mode_search) + if (is_comp_pred) + if (single_skippable[this_mode][refs[0]] && + single_skippable[this_mode][refs[1]]) + vpx_memset(skip_txfm, 1, sizeof(skip_txfm)); + + if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) { // if current pred_error modeled rd is substantially more than the best // so far, do not bother doing full rd if (rd / 2 > ref_best_rd) { - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; - } + restore_dst_buf(xd, orig_dst, orig_dst_stride); return INT64_MAX; } } - if (cpi->common.mcomp_filter_type == SWITCHABLE) - *rate2 += get_switchable_rate(x); + if (cm->interp_filter == SWITCHABLE) + *rate2 += rs; - if (!is_comp_pred && cpi->enable_encode_breakout) { - if (cpi->active_map_enabled && x->active_ptr[0] == 0) - x->skip = 1; - else if (x->encode_breakout) { - const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); - unsigned int var, sse; - // Skipping threshold for ac. - unsigned int thresh_ac; - // The encode_breakout input - unsigned int encode_breakout = x->encode_breakout << 4; - unsigned int max_thresh = 36000; - - // Use extreme low threshold for static frames to limit skipping. - if (cpi->enable_encode_breakout == 2) - max_thresh = 128; - - // Calculate threshold according to dequant value. - thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; - - // Use encode_breakout input if it is bigger than internal threshold. - if (thresh_ac < encode_breakout) - thresh_ac = encode_breakout; - - // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. - if (thresh_ac > max_thresh) - thresh_ac = max_thresh; - - var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride, &sse); - - // Adjust threshold according to partition size. - thresh_ac >>= 8 - (b_width_log2_lookup[bsize] + - b_height_log2_lookup[bsize]); - - // Y skipping condition checking - if (sse < thresh_ac || sse == 0) { - // Skipping threshold for dc - unsigned int thresh_dc; - - thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); - - // dc skipping checking - if ((sse - var) < thresh_dc || sse == var) { - unsigned int sse_u, sse_v; - unsigned int var_u, var_v; - - var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse_u); - - // U skipping condition checking - if ((sse_u * 4 < thresh_ac || sse_u == 0) && - (sse_u - var_u < thresh_dc || sse_u == var_u)) { - var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, - x->plane[2].src.stride, - xd->plane[2].dst.buf, - xd->plane[2].dst.stride, &sse_v); - - // V skipping condition checking - if ((sse_v * 4 < thresh_ac || sse_v == 0) && - (sse_v - var_v < thresh_dc || sse_v == var_v)) { - x->skip = 1; - - // The cost of skip bit needs to be added. - *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); - - // Scaling factor for SSE from spatial domain to frequency domain - // is 16. Adjust distortion accordingly. 
- *distortion_uv = (sse_u + sse_v) << 4; - *distortion = (sse << 4) + *distortion_uv; - - *disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); - } - } - } - } - } + if (!is_comp_pred) { + if (cpi->allow_encode_breakout) + rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv, + disable_skip); } + vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(x->bsse, bsse, sizeof(bsse)); + if (!x->skip) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; int64_t rdcosty = INT64_MAX; // Y cost and distortion + vp9_subtract_plane(x, bsize, 0); super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse, bsize, txfm_cache, ref_best_rd); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; *distortion = INT64_MAX; - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; - } + restore_dst_buf(xd, orig_dst, orig_dst_stride); return INT64_MAX; } @@ -3036,10 +2434,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (*rate_uv == INT_MAX) { *rate2 = INT_MAX; *distortion = INT64_MAX; - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; - } + restore_dst_buf(xd, orig_dst, orig_dst_stride); return INT64_MAX; } @@ -3049,50 +2444,28 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *skippable = skippable_y && skippable_uv; } - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].dst.buf = orig_dst[i]; - xd->plane[i].dst.stride = orig_dst_stride[i]; - } + if (!is_comp_pred) + single_skippable[this_mode][refs[0]] = *skippable; + restore_dst_buf(xd, orig_dst, orig_dst_stride); return this_rd; // if 0, this will be re-calculated by caller } -static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, - int max_plane) { - struct macroblock_plane *const p = x->plane; - struct macroblockd_plane *const pd = x->e_mbd.plane; - int i; - - for (i = 0; i < max_plane; ++i) { - p[i].coeff = ctx->coeff_pbuf[i][1]; - pd[i].qcoeff = ctx->qcoeff_pbuf[i][1]; - pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; - pd[i].eobs = ctx->eobs_pbuf[i][1]; - - ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0]; - ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0]; - ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0]; - ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0]; - - ctx->coeff_pbuf[i][0] = p[i].coeff; - ctx->qcoeff_pbuf[i][0] = pd[i].qcoeff; - ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff; - ctx->eobs_pbuf[i][0] = pd[i].eobs; - } -} - void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int *returnrate, int64_t *returndist, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + struct macroblockd_plane *const pd = xd->plane; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip = 0; int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 }; + TX_SIZE max_uv_tx_size; x->skip_encode = 0; ctx->skip = 0; - xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME; + xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME; + if (bsize >= BLOCK_8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, &y_skip, bsize, tx_cache, @@ -3100,8 +2473,11 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } + max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize, + pd[1].subsampling_x, + pd[1].subsampling_y); rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, 
&rate_uv_tokenonly, - &dist_uv, &uv_skip, bsize); + &dist_uv, &uv_skip, bsize, max_uv_tx_size); } else { y_skip = 0; if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly, @@ -3109,19 +2485,21 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; return; } + max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize, + pd[1].subsampling_x, + pd[1].subsampling_y); rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, - &dist_uv, &uv_skip, BLOCK_8X8); + &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size); } if (y_skip && uv_skip) { *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + - vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); *returndist = dist_y + dist_uv; vp9_zero(ctx->tx_rd_diff); } else { int i; - *returnrate = rate_y + rate_uv + - vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); + *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); *returndist = dist_y + dist_uv; if (cpi->sf.tx_size_search_method == USE_FULL_RD) for (i = 0; i < TX_MODES; i++) { @@ -3132,7 +2510,30 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } } - ctx->mic = *xd->mi_8x8[0]; + ctx->mic = *xd->mi[0]; +} + +// Updating rd_thresh_freq_fact[] here means that the different +// partition/block sizes are handled independently based on the best +// choice for the current partition. It may well be better to keep a scaled +// best rd so far value and update rd_thresh_freq_fact based on the mode/size +// combination that wins out. +static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize, + int best_mode_index) { + if (cpi->sf.adaptive_rd_thresh > 0) { + const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; + int mode; + for (mode = 0; mode < top_mode; ++mode) { + int *const fact = &cpi->rd.thresh_freq_fact[bsize][mode]; + + if (mode == best_mode_index) { + *fact -= (*fact >> 3); + } else { + *fact = MIN(*fact + RD_THRESH_INC, + cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } + } + } } int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, @@ -3143,65 +2544,60 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const struct segmentation *seg = &cm->seg; - const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); - MB_PREDICTION_MODE this_mode; + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct segmentation *const seg = &cm->seg; + struct macroblockd_plane *const pd = xd->plane; + PREDICTION_MODE this_mode; MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; - int comp_pred, i; + int comp_pred, i, k; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; + INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; - int idx_list[4] = {0, - cpi->lst_fb_idx, - cpi->gld_fb_idx, - cpi->alt_fb_idx}; int64_t best_rd = best_rd_so_far; int64_t best_tx_rd[TX_MODES]; int64_t best_tx_diff[TX_MODES]; - int64_t best_pred_diff[NB_PREDICTION_TYPES]; - 
int64_t best_pred_rd[NB_PREDICTION_TYPES]; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_pred_rd[REFERENCE_MODES]; int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - MB_MODE_INFO best_mbmode = { 0 }; - int j; - int mode_index, best_mode_index = 0; + MB_MODE_INFO best_mbmode; + int best_mode_skippable = 0; + int mode_index, best_mode_index = -1; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; int64_t best_intra_rd = INT64_MAX; int64_t best_inter_rd = INT64_MAX; - MB_PREDICTION_MODE best_intra_mode = DC_PRED; + PREDICTION_MODE best_intra_mode = DC_PRED; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; - MB_PREDICTION_MODE mode_uv[TX_SIZES]; - struct scale_factors scale_factor[4]; - unsigned int ref_frame_mask = 0; - unsigned int mode_mask = 0; - int64_t mode_distortions[MB_MODE_COUNT] = {-1}; - int64_t frame_distortions[MAX_REF_FRAMES] = {-1}; + PREDICTION_MODE mode_uv[TX_SIZES]; int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); - const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; - const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; - + int mode_skip_mask = 0; + int mode_skip_start = cpi->sf.mode_skip_start + 1; + const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; + const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize]; + const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags; + const int intra_y_mode_mask = + cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]; + int inter_mode_mask = cpi->sf.inter_mode_mask[bsize]; + vp9_zero(best_mbmode); x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; - // Everywhere the flag is set the error is much higher than its neighbors. - ctx->frames_with_high_error = 0; - ctx->modes_with_high_error = 0; - - estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); - for (i = 0; i < NB_PREDICTION_TYPES; ++i) + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; for (i = 0; i < TX_MODES; i++) best_tx_rd[i] = INT64_MAX; @@ -3209,53 +2605,83 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_filter_rd[i] = INT64_MAX; for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; + for (i = 0; i < MAX_REF_FRAMES; ++i) + x->pred_sse[i] = INT_MAX; + for (i = 0; i < MB_MODE_COUNT; ++i) { + for (k = 0; k < MAX_REF_FRAMES; ++k) { + single_inter_filter[i][k] = SWITCHABLE; + single_skippable[i][k] = 0; + } + } *returnrate = INT_MAX; - // Create a mask set to 1 for each reference frame used by a smaller - // resolution. 
- if (cpi->sf.use_avoid_tested_higherror) { - switch (block_size) { - case BLOCK_64X64: - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - ref_frame_mask |= x->mb_context[i][j].frames_with_high_error; - mode_mask |= x->mb_context[i][j].modes_with_high_error; - } - } - for (i = 0; i < 4; i++) { - ref_frame_mask |= x->sb32_context[i].frames_with_high_error; - mode_mask |= x->sb32_context[i].modes_with_high_error; - } - break; - case BLOCK_32X32: - for (i = 0; i < 4; i++) { - ref_frame_mask |= - x->mb_context[x->sb_index][i].frames_with_high_error; - mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error; - } - break; - default: - // Until we handle all block sizes set it to present; - ref_frame_mask = 0; - mode_mask = 0; - break; - } - ref_frame_mask = ~ref_frame_mask; - mode_mask = ~mode_mask; - } - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, - block_size, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], - yv12_mb, scale_factor); + setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; } + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // All modes from vp9_mode_order that use this frame as any ref + static const int ref_frame_mask_all[] = { + 0x0, 0x123291, 0x25c444, 0x39b722 + }; + // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use + // this frame as their primary ref + static const int ref_frame_mask_fixedmv[] = { + 0x0, 0x121281, 0x24c404, 0x080102 + }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) { + // Skip modes for missing references + mode_skip_mask |= ref_frame_mask_all[ref_frame]; + } else if (cpi->sf.reference_masking) { + for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) { + mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame]; + break; + } + } + } + // If the segment reference frame feature is enabled.... + // then do nothing if the current ref frame is not allowed.. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + mode_skip_mask |= ref_frame_mask_all[ref_frame]; + } + } + + // Disable this drop out case if the ref frame + // segment level feature is enabled for this segment. This is to + // prevent the possibility that we end up unable to pick any mode. + if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. 
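Editor's sketch: the new skip logic above folds per-reference decisions into a single mode_skip_mask bitmask over vp9_mode_order indices, tested once per mode in the main loop. A toy version with a made-up four-entry mode order; the hex masks in the patch encode the real table layout and are not reproduced here.

#include <stdio.h>

enum { THR_NEARESTMV, THR_NEARESTA, THR_NEWMV, THR_NEWA, N_MODES };

int main(void) {
  /* skip every mode whose (assumed) primary ref is the ALTREF frame */
  unsigned mode_skip_mask = (1u << THR_NEARESTA) | (1u << THR_NEWA);
  int mode;
  for (mode = 0; mode < N_MODES; ++mode) {
    if (mode_skip_mask & (1u << mode))
      continue;                      /* masked off: never evaluated */
    printf("evaluating mode %d\n", mode);
  }
  return 0;
}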
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + mode_skip_mask = + ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA)); + if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask |= (1 << THR_NEARA); + if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0) + mode_skip_mask |= (1 << THR_NEARESTA); + } + } + + if (bsize > cpi->sf.max_intra_bsize) { + const int all_intra_modes = (1 << THR_DC) | (1 << THR_TM) | + (1 << THR_H_PRED) | (1 << THR_V_PRED) | (1 << THR_D135_PRED) | + (1 << THR_D207_PRED) | (1 << THR_D153_PRED) | (1 << THR_D63_PRED) | + (1 << THR_D117_PRED) | (1 << THR_D45_PRED); + mode_skip_mask |= all_intra_modes; + } + for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; @@ -3267,125 +2693,166 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; - int64_t total_sse = INT_MAX; + int64_t total_sse = INT64_MAX; int early_term = 0; - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = INT64_MAX; - - x->skip = 0; this_mode = vp9_mode_order[mode_index].mode; - ref_frame = vp9_mode_order[mode_index].ref_frame; - second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; + ref_frame = vp9_mode_order[mode_index].ref_frame[0]; + if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode))) + continue; + second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. - if (mode_index > cpi->sf.mode_skip_start) { - if (mode_index == (cpi->sf.mode_skip_start + 1)) { - switch (vp9_mode_order[best_mode_index].ref_frame) { - case INTRA_FRAME: - cpi->mode_skip_mask = 0; - break; - case LAST_FRAME: - cpi->mode_skip_mask = LAST_FRAME_MODE_MASK; - break; - case GOLDEN_FRAME: - cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK; - break; - case ALTREF_FRAME: - cpi->mode_skip_mask = ALT_REF_MODE_MASK; - break; - case NONE: - case MAX_REF_FRAMES: - assert(!"Invalid Reference frame"); - } + if (mode_index == mode_skip_start && best_mode_index >= 0) { + switch (vp9_mode_order[best_mode_index].ref_frame[0]) { + case INTRA_FRAME: + break; + case LAST_FRAME: + mode_skip_mask |= LAST_FRAME_MODE_MASK; + break; + case GOLDEN_FRAME: + mode_skip_mask |= GOLDEN_FRAME_MODE_MASK; + break; + case ALTREF_FRAME: + mode_skip_mask |= ALT_REF_MODE_MASK; + break; + case NONE: + case MAX_REF_FRAMES: + assert(0 && "Invalid Reference frame"); + break; } - if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) + } + + if (cpi->sf.alt_ref_search_fp && cpi->rc.is_src_frame_alt_ref) { + mode_skip_mask = 0; + if (!(ref_frame == ALTREF_FRAME && second_ref_frame == NONE)) continue; } - // Skip if the current reference frame has been masked off - if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && - (cpi->ref_frame_mask & (1 << ref_frame))) + if (mode_skip_mask & (1 << mode_index)) continue; // Test best rd so far against threshold for trying this mode. - if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] * - cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) || - cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX) + if (rd_less_than_thresh(best_rd, rd_threshes[mode_index], + rd_thresh_freq_fact[mode_index])) continue; - // Do not allow compound prediction if the segment level reference - // frame feature is in use as in this case there can only be one reference. 
- if ((second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - continue; + if (cpi->sf.motion_field_mode_search) { + const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize], + tile->mi_col_end - mi_col); + const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize], + tile->mi_row_end - mi_row); + const int bsl = mi_width_log2(bsize); + int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + MB_MODE_INFO *ref_mbmi; + int const_motion = 1; + int skip_ref_frame = !cb_partition_search_ctrl; + MV_REFERENCE_FRAME rf = NONE; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; - // Skip some checking based on small partitions' result. - if (x->fast_ms > 1 && !ref_frame) - continue; - if (x->fast_ms > 2 && ref_frame != x->subblock_ref) - continue; - - if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) { - if (!(ref_frame_mask & (1 << ref_frame))) { - continue; + if ((mi_row - 1) >= tile->mi_row_start) { + ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0]; + rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0]; + for (i = 0; i < mi_width; ++i) { + ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } } - if (!(mode_mask & (1 << this_mode))) { - continue; - } - if (second_ref_frame != NONE - && !(ref_frame_mask & (1 << second_ref_frame))) { - continue; - } - } - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; + if ((mi_col - 1) >= tile->mi_col_start) { + if (ref_mv.as_int == INVALID_MV) + ref_mv = xd->mi[-1]->mbmi.mv[0]; + if (rf == NONE) + rf = xd->mi[-1]->mbmi.ref_frame[0]; + for (i = 0; i < mi_height; ++i) { + ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } + } - if (!(ref_frame == INTRA_FRAME - || (cpi->ref_frame_flags & flag_list[ref_frame]))) { - continue; - } - if (!(second_ref_frame == NONE - || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { - continue; + if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV) + if (rf > INTRA_FRAME) + if (ref_frame != rf) + continue; + + if (const_motion) + if (this_mode == NEARMV || this_mode == ZEROMV) + continue; } comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) - if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) - continue; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (ref_frame != best_inter_ref_frame && - second_ref_frame != best_inter_ref_frame) - continue; + if (!cm->allow_comp_inter_inter) + continue; + + if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + best_mode_index >=0 && + vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) + continue; + if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) && + ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) + continue; + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + } else { + if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; } - set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); - mbmi->uv_mode = DC_PRED; + if (ref_frame == INTRA_FRAME) { + if (cpi->sf.adaptive_mode_search) + if ((x->source_variance << 
num_pels_log2_lookup[bsize]) > best_intra_rd) + continue; + if (!(intra_y_mode_mask & (1 << this_mode))) + continue; + if (this_mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + continue; + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= TM_PRED)) { + if (best_mode_index >= 0 && + vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME) + continue; + } + if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, best_intra_mode)) + continue; + } + } + } else { + const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame}; + if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv, + inter_mode_mask, this_mode, ref_frames)) + continue; + } + + mbmi->mode = this_mode; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; // Evaluate all sub-pel filters irrespective of whether we can use // them for this frame. - mbmi->interp_filter = cm->mcomp_filter_type; - vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP + : cm->interp_filter; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; - if (comp_pred) { - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) - continue; - set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); - - mode_excluded = mode_excluded - ? mode_excluded - : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; - } else { - if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) { - mode_excluded = - mode_excluded ? - mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY; - } - } + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { @@ -3394,91 +2861,23 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } - // If the segment reference frame feature is enabled.... - // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != - (int)ref_frame) { - continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { - continue; - // Disable this drop out case if the ref frame - // segment level feature is enabled for this segment. This is to - // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(seg, segment_id, - SEG_LVL_REF_FRAME)) { - // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative. We allow near/nearest as well - // because they may result in zero-zero MVs but be cheaper. 
- if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - if ((this_mode != ZEROMV && - !(this_mode == NEARMV && - frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) && - !(this_mode == NEARESTMV && - frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) || - ref_frame != ALTREF_FRAME) { - continue; - } - } - } - // TODO(JBB): This is to make up for the fact that we don't have sad - // functions that work when the block size reads outside the umv. We - // should fix this either by making the motion search just work on - // a representative block in the boundary ( first ) and then implement a - // function that does sads when inside the border.. - if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) && - this_mode == NEWMV) { - continue; - } - -#ifdef MODE_TEST_HIT_STATS - // TEST/DEBUG CODE - // Keep a rcord of the number of test hits at each size - cpi->mode_test_hits[bsize]++; -#endif - + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = INT64_MAX; if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - // Disable intra modes other than DC_PRED for blocks with low variance - // Threshold for intra skipping based on source variance - // TODO(debargha): Specialize the threshold for super block sizes - static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, - }; - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && - this_mode != DC_PRED && - x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) - continue; - // Only search the oblique modes if the best so far is - // one of the neighboring directional modes - if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && - (this_mode >= D45_PRED && this_mode <= TM_PRED)) { - if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME) - continue; - } - mbmi->mode = this_mode; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { - if (conditional_skipintra(mbmi->mode, best_intra_mode)) - continue; - } - - super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, - bsize, tx_cache, best_rd); + super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, + NULL, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; - uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]); + uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd[1].subsampling_x, + pd[1].subsampling_y); if (rate_uv_intra[uv_tx] == INT_MAX) { - choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx], - &rate_uv_tokenonly[uv_tx], - &dist_uv[uv_tx], &skip_uv[uv_tx], - &mode_uv[uv_tx]); + choose_intra_uv_mode(cpi, ctx, bsize, uv_tx, + &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], + &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } rate_uv = rate_uv_tokenonly[uv_tx]; @@ -3486,77 +2885,59 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, skippable = skippable && skip_uv[uv_tx]; mbmi->uv_mode = mode_uv[uv_tx]; - rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; + rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx]; if (this_mode != DC_PRED && this_mode != TM_PRED) rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { - mbmi->mode = this_mode; - compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); - this_rd = handle_inter_mode(cpi, x, tile, bsize, + this_rd = handle_inter_mode(cpi, x, bsize, tx_cache, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv, + 
&disable_skip, frame_mv, mi_row, mi_col, - single_newmv, &total_sse, best_rd); + single_newmv, single_inter_filter, + single_skippable, &total_sse, best_rd); if (this_rd == INT64_MAX) continue; - } - if (cm->comp_pred_mode == HYBRID_PREDICTION) { - rate2 += compmode_cost; + compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += compmode_cost; } // Estimate the reference frame signaling cost and add it // to the rolling cost variable. - if (second_ref_frame > INTRA_FRAME) { + if (comp_pred) { rate2 += ref_costs_comp[ref_frame]; } else { rate2 += ref_costs_single[ref_frame]; } if (!disable_skip) { - // Test for the condition where skip block will be activated - // because there are no non zero coefficients and make any - // necessary adjustment for rate. Ignore if skip is coded at - // segment level as the cost wont have been added in. - // Is Mb level skip allowed (i.e. not coded at segment level). - const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, - SEG_LVL_SKIP); - if (skippable) { + vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); + // Back out the coefficient coding costs rate2 -= (rate_y + rate_uv); // for best yrd calculation rate_uv = 0; - if (mb_skip_allowed) { - int prob_skip_cost; - - // Cost the skip mb case - vp9_prob skip_prob = - vp9_get_pred_prob_mbskip(cm, xd); - - if (skip_prob) { - prob_skip_cost = vp9_cost_bit(skip_prob, 1); - rate2 += prob_skip_cost; - } + // Cost the skip mb case + if (skip_prob) { + int prob_skip_cost = vp9_cost_bit(skip_prob, 1); + rate2 += prob_skip_cost; } - } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + } else if (ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), - 0); - rate2 += prob_skip_cost; + rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } else { // FIXME(rbultje) make this work for splitmv also - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), - 1); - rate2 += prob_skip_cost; + rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); distortion2 = total_sse; assert(total_sse >= 0); rate2 -= (rate_y + rate_uv); @@ -3564,49 +2945,36 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate_uv = 0; this_skip2 = 1; } - } else if (mb_skip_allowed) { + } else { // Add in the cost of the no skip flag. - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), - 0); - rate2 += prob_skip_cost; + rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } // Calculate the final RD estimate for this mode. 
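Editor's sketch: the hunk above rebalances rate2 around the skip flag: when a block codes no non-zero coefficients, the coefficient rate is backed out and the cost of signaling skip=1 is added instead. A sketch under loud assumptions; cost_bit() is a linear placeholder, not the log-probability table behind the real vp9_cost_bit().

#include <stdio.h>

/* Crude stand-in for vp9_cost_bit(): cheap when the coded value is the
 * likely one.  The real function uses a log-probability cost table. */
static int cost_bit(int prob_of_zero_256, int bit) {
  return bit ? 256 - prob_of_zero_256 : prob_of_zero_256;
}

int main(void) {
  int rate2 = 1400, rate_y = 600, rate_uv = 200;
  const int skip_prob = 200;   /* P(skip flag == 0) in 1/256ths, assumed */
  const int skippable = 1;     /* block coded no non-zero coefficients */

  if (skippable) {
    rate2 -= rate_y + rate_uv;            /* back out coefficient costs */
    rate2 += cost_bit(skip_prob, 1);      /* pay for signaling skip */
  } else {
    rate2 += cost_bit(skip_prob, 0);      /* pay for the no-skip flag */
  }
  printf("rate2=%d\n", rate2);
  return 0;
}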
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } + if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd - if (!is_inter_block(&xd->mi_8x8[0]->mbmi) && - this_rd < best_intra_rd) { - best_intra_rd = this_rd; - best_intra_mode = xd->mi_8x8[0]->mbmi.mode; - } - - // Keep record of best inter rd with single reference - if (is_inter_block(&xd->mi_8x8[0]->mbmi) && - !has_second_ref(&xd->mi_8x8[0]->mbmi) && - !mode_excluded && this_rd < best_inter_rd) { - best_inter_rd = this_rd; - best_inter_ref_frame = ref_frame; + if (this_rd < best_intra_rd) { + best_intra_rd = this_rd; + best_intra_mode = mbmi->mode; + } + } else { + // Keep record of best inter rd with single reference + if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) { + best_inter_rd = this_rd; + best_inter_ref_frame = ref_frame; + } } if (!disable_skip && ref_frame == INTRA_FRAME) { - for (i = 0; i < NB_PREDICTION_TYPES; ++i) + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); } - // Store the respective mode distortions for later use. - if (mode_distortions[this_mode] == -1 - || distortion2 < mode_distortions[this_mode]) { - mode_distortions[this_mode] = distortion2; - } - if (frame_distortions[ref_frame] == -1 - || distortion2 < frame_distortions[ref_frame]) { - frame_distortions[ref_frame] = distortion2; - } - // Did this mode help.. i.e. is it the new best mode if (this_rd < best_rd || x->skip) { int max_plane = MAX_MB_PLANE; @@ -3618,6 +2986,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* required for left and above block mv */ mbmi->mv[0].as_int = 0; max_plane = 1; + } else { + best_intra_rd = x->pred_sse[ref_frame]; } *returnrate = rate2; @@ -3625,14 +2995,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; - if (!x->select_txfm_size) - swap_block_ptr(x, ctx, max_plane); + best_mode_skippable = skippable; + + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 1, 0, 0, max_plane); vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], sizeof(uint8_t) * ctx->num_4x4_blk); // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history - if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && + if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) && (mode_index > MIN_EARLY_TERM_INDEX)) { const int qstep = xd->plane[0].dequant[1]; // TODO(debargha): Enhance this by specializing for each mode_index @@ -3651,9 +3023,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best compound/single-only prediction */ if (!disable_skip && ref_frame != INTRA_FRAME) { - int single_rd, hybrid_rd, single_rate, hybrid_rate; + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; - if (cm->comp_pred_mode == HYBRID_PREDICTION) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { single_rate = rate2 - compmode_cost; hybrid_rate = rate2; } else { @@ -3664,40 +3036,39 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); - if (second_ref_frame <= INTRA_FRAME && - single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) { - best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd; - } else if (second_ref_frame > INTRA_FRAME && - single_rd < 
best_pred_rd[COMP_PREDICTION_ONLY]) { - best_pred_rd[COMP_PREDICTION_ONLY] = single_rd; - } - if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION]) - best_pred_rd[HYBRID_PREDICTION] = hybrid_rd; - } - - /* keep record of best filter type */ - if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && - cm->mcomp_filter_type != BILINEAR) { - int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? - SWITCHABLE_FILTERS : cm->mcomp_filter_type]; - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { - int64_t adj_rd; - // In cases of poor prediction, filter_cache[] can contain really big - // values, which actually are bigger than this_rd itself. This can - // cause negative best_filter_rd[] values, which is obviously silly. - // Therefore, if filter_cache < ref, we do an adjusted calculation. - if (cpi->rd_filter_cache[i] >= ref) { - adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; - } else { - // FIXME(rbultje) do this for comppsred also - // - // To prevent out-of-range computation in - // adj_rd = cpi->rd_filter_cache[i] * this_rd / ref - // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio. - int tmp = cpi->rd_filter_cache[i] * 256 / ref; - adj_rd = (this_rd * tmp) >> 8; + if (!comp_pred) { + if (single_rd < best_pred_rd[SINGLE_REFERENCE]) { + best_pred_rd[SINGLE_REFERENCE] = single_rd; + } + } else { + if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) { + best_pred_rd[COMPOUND_REFERENCE] = single_rd; + } + } + if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) + best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + + /* keep record of best filter type */ + if (!mode_excluded && cm->interp_filter != BILINEAR) { + int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ? + SWITCHABLE_FILTERS : cm->interp_filter]; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int64_t adj_rd; + if (ref == INT64_MAX) + adj_rd = 0; + else if (rd_opt->filter_cache[i] == INT64_MAX) + // when early termination is triggered, the encoder does not have + // access to the rate-distortion cost. it only knows that the cost + // should be above the maximum valid value. hence it takes the known + // maximum plus an arbitrary constant as the rate-distortion cost. + adj_rd = rd_opt->mask_filter - ref + 10; + else + adj_rd = rd_opt->filter_cache[i] - ref; + + adj_rd += this_rd; + best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); } - best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); } } @@ -3725,76 +3096,58 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, break; } - if (best_rd >= best_rd_so_far) + // The inter modes' rate costs are not calculated precisely in some cases. + // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and + // ZEROMV. Here, checks are added for those cases, and the mode decisions + // are corrected. 
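The hunk that follows implements the correction described in the comment above. Stripped of the encoder's types, the decision reduces to the sketch below (the SK_* names are local stand-ins, not libvpx enums):

#include <stdint.h>

enum { SK_NEWMV, SK_NEARESTMV, SK_NEARMV, SK_ZEROMV };

/* Demote a NEWMV pick to a cheaper mode whose predictor lands on the same
 * motion vector(s); the second vector is only consulted for compound
 * prediction. */
static int correct_new_mv(int comp, const int32_t mv[2],
                          const int32_t nearest[2], const int32_t near_[2]) {
  if (mv[0] == nearest[0] && (!comp || mv[1] == nearest[1]))
    return SK_NEARESTMV;
  if (mv[0] == near_[0] && (!comp || mv[1] == near_[1]))
    return SK_NEARMV;
  if (mv[0] == 0 && (!comp || mv[1] == 0))
    return SK_ZEROMV;
  return SK_NEWMV;
}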
+ if (best_mbmode.mode == NEWMV) { + const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0], + best_mbmode.ref_frame[1]}; + int comp_pred_mode = refs[1] > INTRA_FRAME; + + if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) || !comp_pred_mode)) + best_mbmode.mode = NEARESTMV; + else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int && + ((comp_pred_mode && frame_mv[NEARMV][refs[1]].as_int == + best_mbmode.mv[1].as_int) || !comp_pred_mode)) + best_mbmode.mode = NEARMV; + else if (best_mbmode.mv[0].as_int == 0 && + ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode)) + best_mbmode.mode = ZEROMV; + } + + if (best_mode_index < 0 || best_rd >= best_rd_so_far) return INT64_MAX; // If we used an estimate for the uv intra rd in the loop above... if (cpi->sf.use_uv_intra_rd_estimate) { // Do Intra UV best rd mode selection if best mode choice above was intra. - if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) { - TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); + if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) { + TX_SIZE uv_tx_size; + *mbmi = best_mbmode; + uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]); rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], &rate_uv_tokenonly[uv_tx_size], &dist_uv[uv_tx_size], &skip_uv[uv_tx_size], - bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize); + bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, + uv_tx_size); } } - // If we are using reference masking and the set mask flag is set then - // create the reference frame mask. - if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) - cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame); + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter) || + !is_inter_block(&best_mbmode)); - // Flag all modes that have a distortion thats > 2x the best we found at - // this level. - for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) { - if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV) - continue; - - if (mode_distortions[mode_index] > 2 * *returndistortion) { - ctx->modes_with_high_error |= (1 << mode_index); - } - } - - // Flag all ref frames that have a distortion thats > 2x the best we found at - // this level. - for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - if (frame_distortions[ref_frame] > 2 * *returndistortion) { - ctx->frames_with_high_error |= (1 << ref_frame); - } - } - - assert((cm->mcomp_filter_type == SWITCHABLE) || - (cm->mcomp_filter_type == best_mbmode.interp_filter) || - (best_mbmode.ref_frame[0] == INTRA_FRAME)); - - // Updating rd_thresh_freq_fact[] here means that the different - // partition/block sizes are handled independently based on the best - // choice for the current partition. It may well be better to keep a scaled - // best rd so far value and update rd_thresh_freq_fact based on the mode/size - // combination that wins out. 
- if (cpi->sf.adaptive_rd_thresh) { - for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) { - if (mode_index == best_mode_index) { - cpi->rd_thresh_freq_fact[bsize][mode_index] -= - (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3); - } else { - cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC; - if (cpi->rd_thresh_freq_fact[bsize][mode_index] > - (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) { - cpi->rd_thresh_freq_fact[bsize][mode_index] = - cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT; - } - } - } - } + update_rd_thresh_fact(cpi, bsize, best_mode_index); // macroblock modes *mbmi = best_mbmode; x->skip |= best_skip2; - for (i = 0; i < NB_PREDICTION_TYPES; ++i) { + for (i = 0; i < REFERENCE_MODES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN; else @@ -3808,13 +3161,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, else best_filter_diff[i] = best_rd - best_filter_rd[i]; } - if (cm->mcomp_filter_type == SWITCHABLE) + if (cm->interp_filter == SWITCHABLE) assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); - } else { - vp9_zero(best_filter_diff); - } - - if (!x->skip) { for (i = 0; i < TX_MODES; i++) { if (best_tx_rd[i] == INT64_MAX) best_tx_diff[i] = 0; @@ -3822,20 +3170,121 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_tx_diff[i] = best_rd - best_tx_rd[i]; } } else { + vp9_zero(best_filter_diff); vp9_zero(best_tx_diff); } - set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], - scale_factor); - store_coding_context(x, ctx, best_mode_index, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : - mbmi->ref_frame[1]][0], - best_pred_diff, best_tx_diff, best_filter_diff); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + store_coding_context(x, ctx, best_mode_index, best_pred_diff, + best_tx_diff, best_filter_diff, best_mode_skippable); return best_rd; } +int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + int64_t best_tx_diff[TX_MODES]; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; + unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; + vp9_prob comp_mode_p; + INTERP_FILTER best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + + x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; + + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, + &comp_mode_p); + + for (i = 0; i < MAX_REF_FRAMES; ++i) + x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) + x->pred_mv_sad[i] = INT_MAX; + + *returnrate = INT_MAX; + + assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mbmi->mode = ZEROMV; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE; + mbmi->mv[0].as_int = 0; + x->skip = 1; + + // Search for best switchable filter by checking the variance of + // pred error irrespective of whether the filter will be used + rd_opt->mask_filter = 0; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + rd_opt->filter_cache[i] 
= INT64_MAX; + + if (cm->interp_filter != BILINEAR) { + best_filter = EIGHTTAP; + if (cm->interp_filter == SWITCHABLE && + x->source_variance >= cpi->sf.disable_filter_search_var_thresh) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + mbmi->interp_filter = i; + rs = vp9_get_switchable_rate(cpi); + if (rs < best_rs) { + best_rs = rs; + best_filter = mbmi->interp_filter; + } + } + } + } + // Set the appropriate filter + if (cm->interp_filter == SWITCHABLE) { + mbmi->interp_filter = best_filter; + rate2 += vp9_get_switchable_rate(cpi); + } else { + mbmi->interp_filter = cm->interp_filter; + } + + if (cm->reference_mode == REFERENCE_MODE_SELECT) + rate2 += vp9_cost_bit(comp_mode_p, comp_pred); + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + *returnrate = rate2; + *returndistortion = distortion2; + + if (this_rd >= best_rd_so_far) + return INT64_MAX; + + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == mbmi->interp_filter)); + + update_rd_thresh_fact(cpi, bsize, THR_ZEROMV); + + vp9_zero(best_pred_diff); + vp9_zero(best_filter_diff); + vp9_zero(best_tx_diff); + + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE); + store_coding_context(x, ctx, THR_ZEROMV, + best_pred_diff, best_tx_diff, best_filter_diff, 0); + + return this_rd; +} int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, @@ -3845,11 +3294,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; - const struct segmentation *seg = &cm->seg; - const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]); + VP9_COMMON *const cm = &cpi->common; + RD_OPT *const rd_opt = &cpi->rd; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + const struct segmentation *const seg = &cm->seg; MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = mbmi->segment_id; int comp_pred, i; @@ -3857,40 +3306,33 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; - int idx_list[4] = {0, - cpi->lst_fb_idx, - cpi->gld_fb_idx, - cpi->alt_fb_idx}; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise - int64_t best_tx_rd[TX_MODES]; - int64_t best_tx_diff[TX_MODES]; - int64_t best_pred_diff[NB_PREDICTION_TYPES]; - int64_t best_pred_rd[NB_PREDICTION_TYPES]; + static const int64_t best_tx_diff[TX_MODES] = { 0 }; + int64_t best_pred_diff[REFERENCE_MODES]; + int64_t best_pred_rd[REFERENCE_MODES]; int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - MB_MODE_INFO best_mbmode = { 0 }; - int mode_index, best_mode_index = 0; + MB_MODE_INFO best_mbmode; + int ref_index, best_ref_index = 0; unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES]; vp9_prob comp_mode_p; int64_t best_inter_rd = INT64_MAX; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE; - int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; - int64_t dist_uv[TX_SIZES]; 
- int skip_uv[TX_SIZES]; - MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 }; - struct scale_factors scale_factor[4]; - unsigned int ref_frame_mask = 0; - unsigned int mode_mask = 0; - int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex, - cpi->common.y_dc_delta_q); + INTERP_FILTER tmp_best_filter = SWITCHABLE; + int rate_uv_intra, rate_uv_tokenonly; + int64_t dist_uv; + int skip_uv; + PREDICTION_MODE mode_uv = DC_PRED; + int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; + int mode_skip_mask = 0; x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4); + vp9_zero(best_mbmode); for (i = 0; i < 4; i++) { int j; @@ -3898,41 +3340,29 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, seg_mvs[i][j].as_int = INVALID_MV; } - estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, + estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); - for (i = 0; i < NB_PREDICTION_TYPES; ++i) + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX; - for (i = 0; i < TX_MODES; i++) - best_tx_rd[i] = INT64_MAX; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = INT64_MAX; - for (i = 0; i < TX_SIZES; i++) - rate_uv_intra[i] = INT_MAX; + rate_uv_intra = INT_MAX; *returnrate = INT_MAX; - // Create a mask set to 1 for each reference frame used by a smaller - // resolution. - if (cpi->sf.use_avoid_tested_higherror) { - ref_frame_mask = 0; - mode_mask = 0; - ref_frame_mask = ~ref_frame_mask; - mode_mask = ~mode_mask; - } - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame, - block_size, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], - yv12_mb, scale_factor); + setup_buffer_inter(cpi, x, tile, + ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], + yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; } - for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) { + for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) { int mode_excluded = 0; int64_t this_rd = INT64_MAX; int disable_skip = 0; @@ -3940,126 +3370,86 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = INT64_MAX; - - x->skip = 0; - ref_frame = vp9_ref_order[mode_index].ref_frame; - second_ref_frame = vp9_ref_order[mode_index].second_ref_frame; + ref_frame = vp9_ref_order[ref_index].ref_frame[0]; + second_ref_frame = vp9_ref_order[ref_index].ref_frame[1]; // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. 
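The switch in the next hunk picks a small bitmask, mode_skip_mask, from the reference frame of the best candidate found in the first three iterations; each set bit then suppresses the vp9_ref_order entry with that index. The test itself is just:

/* Bit ref_index of mode_skip_mask disables that candidate outright. */
static int should_skip_ref(unsigned mode_skip_mask, int ref_index) {
  return (mode_skip_mask >> ref_index) & 1;
}

so 0x0010 and 0x0008 each knock out a single later entry of the order table, while 0 leaves every candidate live.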
- if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) { - if (mode_index == 3) { - switch (vp9_ref_order[best_mode_index].ref_frame) { + if (ref_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) { + if (ref_index == 3) { + switch (vp9_ref_order[best_ref_index].ref_frame[0]) { case INTRA_FRAME: - cpi->mode_skip_mask = 0; + mode_skip_mask = 0; break; case LAST_FRAME: - cpi->mode_skip_mask = 0x0010; + mode_skip_mask = 0x0010; break; case GOLDEN_FRAME: - cpi->mode_skip_mask = 0x0008; + mode_skip_mask = 0x0008; break; case ALTREF_FRAME: - cpi->mode_skip_mask = 0x0000; + mode_skip_mask = 0x0000; break; case NONE: case MAX_REF_FRAMES: - assert(!"Invalid Reference frame"); + assert(0 && "Invalid Reference frame"); + break; } } - if (cpi->mode_skip_mask & ((int64_t)1 << mode_index)) + if (mode_skip_mask & (1 << ref_index)) continue; } - // Skip if the current reference frame has been masked off - if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && - (cpi->ref_frame_mask & (1 << ref_frame))) - continue; - // Test best rd so far against threshold for trying this mode. - if ((best_rd < - ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] * - cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) || - cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX) + if (rd_less_than_thresh(best_rd, + rd_opt->threshes[segment_id][bsize][ref_index], + rd_opt->thresh_freq_fact[bsize][ref_index])) continue; - // Do not allow compound prediction if the segment level reference - // frame feature is in use as in this case there can only be one reference. - if ((second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) - continue; - - mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = second_ref_frame; - - if (!(ref_frame == INTRA_FRAME - || (cpi->ref_frame_flags & flag_list[ref_frame]))) { - continue; - } - if (!(second_ref_frame == NONE - || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { + if (ref_frame > INTRA_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame])) { continue; } comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) - if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) - continue; - if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (ref_frame != best_inter_ref_frame && - second_ref_frame != best_inter_ref_frame) - continue; + if (!cm->allow_comp_inter_inter) + continue; + + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) + continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) + continue; + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) + continue; + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) && + ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) + continue; } // TODO(jingning, jkoleszar): scaling reference frame not supported for // sub8x8 blocks. 
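vp9_is_scaled(), used by the scaling checks that follow, reports whether a reference frame's resolution differs from the current frame. A sketch of the idea under assumed names — the real struct scale_factors fields may differ, but vp9 stores the per-axis scale in 14-bit fixed point, so unity is 1 << 14:

#define SKETCH_REF_SCALE_SHIFT 14  /* assumed fixed-point shift */
#define SKETCH_REF_NO_SCALE (1 << SKETCH_REF_SCALE_SHIFT)

struct sketch_scale_factors { int x_scale_fp, y_scale_fp; };

/* A reference counts as scaled when either axis ratio is not exactly 1. */
static int sketch_is_scaled(const struct sketch_scale_factors *sf) {
  return sf->x_scale_fp != SKETCH_REF_NO_SCALE ||
         sf->y_scale_fp != SKETCH_REF_NO_SCALE;
}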
- if (ref_frame > 0 && - vp9_is_scaled(scale_factor[ref_frame].sfc)) + if (ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf)) continue; - if (second_ref_frame > 0 && - vp9_is_scaled(scale_factor[second_ref_frame].sfc)) + if (second_ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf)) continue; - set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); - mbmi->uv_mode = DC_PRED; - - // Evaluate all sub-pel filters irrespective of whether we can use - // them for this frame. - mbmi->interp_filter = cm->mcomp_filter_type; - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - - if (comp_pred) { - if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) - continue; - set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); - - mode_excluded = mode_excluded - ? mode_excluded - : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; - } else { - if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) { - mode_excluded = - mode_excluded ? - mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY; - } - } - - // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; - if (comp_pred) - xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; - } + if (comp_pred) + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + else if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. @@ -4067,11 +3457,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; - // If the segment skip feature is enabled.... - // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && - ref_frame != INTRA_FRAME) { - continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. @@ -4081,19 +3466,30 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // unless ARNR filtering is enabled in which case we want // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. - if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) continue; } -#ifdef MODE_TEST_HIT_STATS - // TEST/DEBUG CODE - // Keep a rcord of the number of test hits at each size - cpi->mode_test_hits[bsize]++; -#endif + mbmi->tx_size = TX_4X4; + mbmi->uv_mode = DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + // Evaluate all sub-pel filters irrespective of whether we can use + // them for this frame. + mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP + : cm->interp_filter; + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Select prediction reference frames. 
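Before the reference planes are wired up below, note that the mode_excluded assignments earlier in this hunk reduce to one rule: a compound candidate is excluded when the frame is restricted to single references, a single inter candidate when it is restricted to compound ones, and intra candidates are never excluded this way. A compact sketch with stand-in enum names:

enum sketch_ref_mode { SK_SINGLE, SK_COMPOUND, SK_SELECT };

/* Mirrors the reference_mode exclusion in the hunk above. */
static int sketch_mode_excluded(enum sketch_ref_mode frame_mode,
                                int comp_pred, int is_intra) {
  if (comp_pred)
    return frame_mode == SK_SINGLE;
  return !is_intra && frame_mode == SK_COMPOUND;
}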
+ for (i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) + xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } if (ref_frame == INTRA_FRAME) { int rate; - mbmi->tx_size = TX_4X4; if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y, best_rd) >= best_rd) continue; @@ -4101,20 +3497,18 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 += distortion_y; - if (rate_uv_intra[TX_4X4] == INT_MAX) { - choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4], - &rate_uv_tokenonly[TX_4X4], - &dist_uv[TX_4X4], &skip_uv[TX_4X4], - &mode_uv[TX_4X4]); + if (rate_uv_intra == INT_MAX) { + choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4, + &rate_uv_intra, + &rate_uv_tokenonly, + &dist_uv, &skip_uv, + &mode_uv); } - rate2 += rate_uv_intra[TX_4X4]; - rate_uv = rate_uv_tokenonly[TX_4X4]; - distortion2 += dist_uv[TX_4X4]; - distortion_uv = dist_uv[TX_4X4]; - mbmi->uv_mode = mode_uv[TX_4X4]; - tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = tx_cache[ONLY_4X4]; + rate2 += rate_uv_intra; + rate_uv = rate_uv_tokenonly; + distortion2 += dist_uv; + distortion_uv = dist_uv; + mbmi->uv_mode = mode_uv; } else { int rate; int64_t distortion; @@ -4133,19 +3527,24 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int uv_skippable; this_rd_thresh = (ref_frame == LAST_FRAME) ? - cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] : - cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR]; + rd_opt->threshes[segment_id][bsize][THR_LAST] : + rd_opt->threshes[segment_id][bsize][THR_ALTR]; this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? - cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh; - xd->mi_8x8[0]->mbmi.tx_size = TX_4X4; + rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh; + rd_opt->mask_filter = 0; + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + rd_opt->filter_cache[i] = INT64_MAX; - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; - if (cm->mcomp_filter_type != BILINEAR) { + if (cm->interp_filter != BILINEAR) { tmp_best_filter = EIGHTTAP; - if (x->source_variance < - cpi->sf.disable_filter_search_var_thresh) { + if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { tmp_best_filter = EIGHTTAP; - vp9_zero(cpi->rd_filter_cache); + } else if (cpi->sf.adaptive_pred_interp_filter == 1 && + ctx->pred_interp_filter < SWITCHABLE) { + tmp_best_filter = ctx->pred_interp_filter; + } else if (cpi->sf.adaptive_pred_interp_filter == 2) { + tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ? 
+ ctx->pred_interp_filter : 0; } else { for (switchable_filter_index = 0; switchable_filter_index < SWITCHABLE_FILTERS; @@ -4153,37 +3552,36 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int newbest, rs; int64_t rs_rd; mbmi->interp_filter = switchable_filter_index; - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); - - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, - &mbmi->ref_mvs[ref_frame][0], - second_ref, - best_yrd, - &rate, &rate_y, &distortion, - &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, - bsi, switchable_filter_index, - mi_row, mi_col); + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile, + &mbmi->ref_mvs[ref_frame][0], + second_ref, best_yrd, &rate, + &rate_y, &distortion, + &skippable, &total_sse, + (int) this_rd_thresh, seg_mvs, + bsi, switchable_filter_index, + mi_row, mi_col); if (tmp_rd == INT64_MAX) continue; - cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; - rs = get_switchable_rate(x); + rs = vp9_get_switchable_rate(cpi); rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); - cpi->rd_filter_cache[SWITCHABLE_FILTERS] = - MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], + rd_opt->filter_cache[switchable_filter_index] = tmp_rd; + rd_opt->filter_cache[SWITCHABLE_FILTERS] = + MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd); - if (cm->mcomp_filter_type == SWITCHABLE) + if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd; + rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd); + newbest = (tmp_rd < tmp_best_rd); if (newbest) { tmp_best_filter = mbmi->interp_filter; tmp_best_rd = tmp_rd; } - if ((newbest && cm->mcomp_filter_type == SWITCHABLE) || - (mbmi->interp_filter == cm->mcomp_filter_type && - cm->mcomp_filter_type != SWITCHABLE)) { + if ((newbest && cm->interp_filter == SWITCHABLE) || + (mbmi->interp_filter == cm->interp_filter && + cm->interp_filter != SWITCHABLE)) { tmp_best_rdu = tmp_rd; tmp_best_rate = rate; tmp_best_ratey = rate_y; @@ -4192,8 +3590,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, tmp_best_skippable = skippable; tmp_best_mbmode = *mbmi; for (i = 0; i < 4; i++) { - tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i]; - x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i]; + tmp_best_bmodes[i] = xd->mi[0]->bmi[i]; + x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i]; } pred_exists = 1; if (switchable_filter_index == 0 && @@ -4212,32 +3610,23 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } } - if (tmp_best_rdu == INT64_MAX) + if (tmp_best_rdu == INT64_MAX && pred_exists) continue; - mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ? - tmp_best_filter : cm->mcomp_filter_type); - vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); + mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ? 
+ tmp_best_filter : cm->interp_filter); if (!pred_exists) { // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile, - &mbmi->ref_mvs[ref_frame][0], - second_ref, - best_yrd, - &rate, &rate_y, &distortion, - &skippable, &total_sse, - (int)this_rd_thresh, seg_mvs, - bsi, 0, - mi_row, mi_col); + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile, + &mbmi->ref_mvs[ref_frame][0], + second_ref, best_yrd, &rate, &rate_y, + &distortion, &skippable, &total_sse, + (int) this_rd_thresh, seg_mvs, bsi, 0, + mi_row, mi_col); if (tmp_rd == INT64_MAX) continue; } else { - if (cpi->common.mcomp_filter_type == SWITCHABLE) { - int rs = get_switchable_rate(x); - tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); - } - tmp_rd = tmp_best_rdu; total_sse = tmp_best_sse; rate = tmp_best_rate; rate_y = tmp_best_ratey; @@ -4245,21 +3634,19 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, skippable = tmp_best_skippable; *mbmi = tmp_best_mbmode; for (i = 0; i < 4; i++) - xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i]; + xd->mi[0]->bmi[i] = tmp_best_bmodes[i]; } rate2 += rate; distortion2 += distortion; - if (cpi->common.mcomp_filter_type == SWITCHABLE) - rate2 += get_switchable_rate(x); + if (cm->interp_filter == SWITCHABLE) + rate2 += vp9_get_switchable_rate(cpi); + + if (!mode_excluded) + mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE + : cm->reference_mode == COMPOUND_REFERENCE; - if (!mode_excluded) { - if (comp_pred) - mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY; - else - mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY; - } compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); tmp_best_rdu = best_rd - @@ -4279,16 +3666,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion_uv; skippable = skippable && uv_skippable; total_sse += uv_sse; - - tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < TX_MODES; ++i) - tx_cache[i] = tx_cache[ONLY_4X4]; } } - if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost; - } // Estimate the reference frame signaling cost and add it // to the rolling cost variable. @@ -4299,26 +3681,17 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!disable_skip) { - // Test for the condition where skip block will be activated - // because there are no non zero coefficients and make any - // necessary adjustment for rate. Ignore if skip is coded at - // segment level as the cost wont have been added in. - // Is Mb level skip allowed (i.e. not coded at segment level). - const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, - SEG_LVL_SKIP); + // Skip is never coded at the segment level for sub8x8 blocks and instead + // always coded in the bitstream at the mode info level. - if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { + if (ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. 
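The skip-flag costing in the lines that follow charges vp9_cost_bit(skip_prob, 0) for signalling "not skipped" and vp9_cost_bit(skip_prob, 1) for the skip case. Conceptually, the cost of a bit is -log2 of its probability; a hedged sketch (the 256-units-per-bit scale is illustrative, not libvpx's exact table):

#include <math.h>

/* prob_of_zero is an 8-bit probability (out of 256) that the bit is 0. */
static int sketch_cost_bit(int prob_of_zero, int bit) {
  const double p = (bit ? 256 - prob_of_zero : prob_of_zero) / 256.0;
  return (int)(-log2(p) * 256.0 + 0.5);
}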
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), - 0); - rate2 += prob_skip_cost; + rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } else { // FIXME(rbultje) make this work for splitmv also - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), - 1); - rate2 += prob_skip_cost; + rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); distortion2 = total_sse; assert(total_sse >= 0); rate2 -= (rate_y + rate_uv); @@ -4326,11 +3699,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, rate_uv = 0; this_skip2 = 1; } - } else if (mb_skip_allowed) { + } else { // Add in the cost of the no skip flag. - int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), - 0); - rate2 += prob_skip_cost; + rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0); } // Calculate the final RD estimate for this mode. @@ -4338,8 +3709,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } // Keep record of best inter rd with single reference - if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME && - xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE && + if (is_inter_block(mbmi) && + !has_second_ref(mbmi) && !mode_excluded && this_rd < best_inter_rd) { best_inter_rd = this_rd; @@ -4347,7 +3718,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, } if (!disable_skip && ref_frame == INTRA_FRAME) { - for (i = 0; i < NB_PREDICTION_TYPES; ++i) + for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) best_filter_rd[i] = MIN(best_filter_rd[i], this_rd); @@ -4358,7 +3729,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, if (!mode_excluded) { int max_plane = MAX_MB_PLANE; // Note index of best mode so far - best_mode_index = mode_index; + best_ref_index = ref_index; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ @@ -4373,18 +3744,18 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; - if (!x->select_txfm_size) - swap_block_ptr(x, ctx, max_plane); - vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + if (!x->select_tx_size) + swap_block_ptr(x, ctx, 1, 0, 0, max_plane); + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4], sizeof(uint8_t) * ctx->num_4x4_blk); for (i = 0; i < 4; i++) - best_bmodes[i] = xd->mi_8x8[0]->bmi[i]; + best_bmodes[i] = xd->mi[0]->bmi[i]; // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) && - (mode_index > MIN_EARLY_TERM_INDEX)) { + (ref_index > MIN_EARLY_TERM_INDEX)) { const int qstep = xd->plane[0].dequant[1]; // TODO(debargha): Enhance this by specializing for each mode_index int scale = 4; @@ -4402,9 +3773,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, /* keep record of best compound/single-only prediction */ if (!disable_skip && ref_frame != INTRA_FRAME) { - int single_rd, hybrid_rd, single_rate, hybrid_rate; + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; - if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { + if (cm->reference_mode == REFERENCE_MODE_SELECT) { single_rate = rate2 - compmode_cost; hybrid_rate = rate2; } else { @@ -4415,54 +3786,35 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); hybrid_rd 
= RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); - if (second_ref_frame <= INTRA_FRAME && - single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) { - best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd; - } else if (second_ref_frame > INTRA_FRAME && - single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) { - best_pred_rd[COMP_PREDICTION_ONLY] = single_rd; + if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE]) { + best_pred_rd[SINGLE_REFERENCE] = single_rd; + } else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE]) { + best_pred_rd[COMPOUND_REFERENCE] = single_rd; } - if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION]) - best_pred_rd[HYBRID_PREDICTION] = hybrid_rd; + if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT]) + best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; } /* keep record of best filter type */ if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && - cm->mcomp_filter_type != BILINEAR) { - int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? - SWITCHABLE_FILTERS : cm->mcomp_filter_type]; + cm->interp_filter != BILINEAR) { + int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ? + SWITCHABLE_FILTERS : cm->interp_filter]; + int64_t adj_rd; for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { - int64_t adj_rd; - // In cases of poor prediction, filter_cache[] can contain really big - // values, which actually are bigger than this_rd itself. This can - // cause negative best_filter_rd[] values, which is obviously silly. - // Therefore, if filter_cache < ref, we do an adjusted calculation. - if (cpi->rd_filter_cache[i] >= ref) - adj_rd = this_rd + cpi->rd_filter_cache[i] - ref; - else // FIXME(rbultje) do this for comppred also - adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref; - best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); - } - } - - /* keep record of best txfm size */ - if (bsize < BLOCK_32X32) { - if (bsize < BLOCK_16X16) { - tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; - tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; - } - tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; - } - if (!mode_excluded && this_rd != INT64_MAX) { - for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) { - int64_t adj_rd = INT64_MAX; - if (ref_frame > INTRA_FRAME) - adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; + if (ref == INT64_MAX) + adj_rd = 0; + else if (rd_opt->filter_cache[i] == INT64_MAX) + // when early termination is triggered, the encoder does not have + // access to the rate-distortion cost. it only knows that the cost + // should be above the maximum valid value. hence it takes the known + // maximum plus an arbitrary constant as the rate-distortion cost. + adj_rd = rd_opt->mask_filter - ref + 10; else - adj_rd = this_rd; + adj_rd = rd_opt->filter_cache[i] - ref; - if (adj_rd < best_tx_rd[i]) - best_tx_rd[i] = adj_rd; + adj_rd += this_rd; + best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd); } } @@ -4479,67 +3831,43 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, // If we used an estimate for the uv intra rd in the loop above... if (cpi->sf.use_uv_intra_rd_estimate) { // Do Intra UV best rd mode selection if best mode choice above was intra. 
- if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) { - TX_SIZE uv_tx_size = get_uv_tx_size(mbmi); - rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size], - &rate_uv_tokenonly[uv_tx_size], - &dist_uv[uv_tx_size], - &skip_uv[uv_tx_size], - BLOCK_8X8); + if (vp9_ref_order[best_ref_index].ref_frame[0] == INTRA_FRAME) { + *mbmi = best_mbmode; + rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra, + &rate_uv_tokenonly, + &dist_uv, + &skip_uv, + BLOCK_8X8, TX_4X4); } } - // If we are using reference masking and the set mask flag is set then - // create the reference frame mask. - if (cpi->sf.reference_masking && cpi->set_ref_frame_mask) - cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame); - - if (best_rd == INT64_MAX && bsize < BLOCK_8X8) { + if (best_rd == INT64_MAX) { *returnrate = INT_MAX; - *returndistortion = INT_MAX; + *returndistortion = INT64_MAX; return best_rd; } - assert((cm->mcomp_filter_type == SWITCHABLE) || - (cm->mcomp_filter_type == best_mbmode.interp_filter) || - (best_mbmode.ref_frame[0] == INTRA_FRAME)); + assert((cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == best_mbmode.interp_filter) || + !is_inter_block(&best_mbmode)); - // Updating rd_thresh_freq_fact[] here means that the different - // partition/block sizes are handled independently based on the best - // choice for the current partition. It may well be better to keep a scaled - // best rd so far value and update rd_thresh_freq_fact based on the mode/size - // combination that wins out. - if (cpi->sf.adaptive_rd_thresh) { - for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) { - if (mode_index == best_mode_index) { - cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -= - (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3); - } else { - cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC; - if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] > - (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) { - cpi->rd_thresh_freq_sub8x8[bsize][mode_index] = - cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT; - } - } - } - } + update_rd_thresh_fact(cpi, bsize, best_ref_index); // macroblock modes *mbmi = best_mbmode; x->skip |= best_skip2; - if (best_mbmode.ref_frame[0] == INTRA_FRAME) { + if (!is_inter_block(&best_mbmode)) { for (i = 0; i < 4; i++) - xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode; + xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode; } else { for (i = 0; i < 4; ++i) - vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); + vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info)); - mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int; - mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int; + mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int; + mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; } - for (i = 0; i < NB_PREDICTION_TYPES; ++i) { + for (i = 0; i < REFERENCE_MODES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN; else @@ -4553,30 +3881,15 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, else best_filter_diff[i] = best_rd - best_filter_rd[i]; } - if (cm->mcomp_filter_type == SWITCHABLE) + if (cm->interp_filter == SWITCHABLE) assert(best_filter_diff[SWITCHABLE_FILTERS] == 0); } else { vp9_zero(best_filter_diff); } - if (!x->skip) { - for (i = 0; i < TX_MODES; i++) { - if (best_tx_rd[i] == INT64_MAX) - best_tx_diff[i] = 0; - else - best_tx_diff[i] = best_rd - best_tx_rd[i]; - } - } else { - vp9_zero(best_tx_diff); - } - - 
set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], - scale_factor); - store_coding_context(x, ctx, best_mode_index, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], - &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : - mbmi->ref_frame[1]][0], - best_pred_diff, best_tx_diff, best_filter_diff); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + store_coding_context(x, ctx, best_ref_index, + best_pred_diff, best_tx_diff, best_filter_diff, 0); return best_rd; } diff --git a/media/libvpx/vp9/encoder/vp9_rdopt.h b/media/libvpx/vp9/encoder/vp9_rdopt.h index 92fb23548e05..52c603fb6484 100644 --- a/media/libvpx/vp9/encoder/vp9_rdopt.h +++ b/media/libvpx/vp9/encoder/vp9_rdopt.h @@ -8,29 +8,27 @@ * be found in the AUTHORS file in the root of the source tree. */ - #ifndef VP9_ENCODER_VP9_RDOPT_H_ #define VP9_ENCODER_VP9_RDOPT_H_ -#define RDDIV_BITS 7 +#include "vp9/common/vp9_blockd.h" -#define RDCOST(RM, DM, R, D) \ - (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) -#define QIDX_SKIP_THRESH 115 +#include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_context_tree.h" + +#ifdef __cplusplus +extern "C" { +#endif struct TileInfo; +struct VP9_COMP; +struct macroblock; -int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex); - -void vp9_initialize_rd_consts(VP9_COMP *cpi); - -void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); - -void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, +void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, int *r, int64_t *d, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); -int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, +int64_t vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, const struct TileInfo *const tile, int mi_row, int mi_col, int *returnrate, @@ -39,7 +37,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); -int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, +int64_t vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi, + struct macroblock *x, + int *returnrate, + int64_t *returndistortion, + BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + +int64_t vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, + struct macroblock *x, const struct TileInfo *const tile, int mi_row, int mi_col, int *returnrate, @@ -48,14 +55,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); -void vp9_init_me_luts(); - -void vp9_set_mbmode_and_mvs(MACROBLOCK *x, - MB_PREDICTION_MODE mb, int_mv *mv); - -void vp9_get_entropy_contexts(TX_SIZE tx_size, - ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16], - const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left, - int num_4x4_w, int num_4x4_h); +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_RDOPT_H_ diff --git a/media/libvpx/vp9/encoder/vp9_resize.c b/media/libvpx/vp9/encoder/vp9_resize.c new file mode 100644 index 000000000000..4e6efaeb969f --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_resize.c @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
*/
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_resize.h"
+
+#define FILTER_BITS 7
+
+#define INTERP_TAPS 8
+#define SUBPEL_BITS 5
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS 32
+
+typedef int16_t interp_kernel[INTERP_TAPS];
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+  {-3, 0, 35, 64, 35, 0, -3, 0},
+  {-3, -1, 34, 64, 36, 1, -3, 0},
+  {-3, -1, 32, 64, 38, 1, -3, 0},
+  {-2, -2, 31, 63, 39, 2, -3, 0},
+  {-2, -2, 29, 63, 41, 2, -3, 0},
+  {-2, -2, 28, 63, 42, 3, -4, 0},
+  {-2, -3, 27, 63, 43, 4, -4, 0},
+  {-2, -3, 25, 62, 45, 5, -4, 0},
+  {-2, -3, 24, 62, 46, 5, -4, 0},
+  {-2, -3, 23, 61, 47, 6, -4, 0},
+  {-2, -3, 21, 60, 49, 7, -4, 0},
+  {-1, -4, 20, 60, 50, 8, -4, -1},
+  {-1, -4, 19, 59, 51, 9, -4, -1},
+  {-1, -4, 17, 58, 52, 10, -4, 0},
+  {-1, -4, 16, 57, 53, 12, -4, -1},
+  {-1, -4, 15, 56, 54, 13, -4, -1},
+  {-1, -4, 14, 55, 55, 14, -4, -1},
+  {-1, -4, 13, 54, 56, 15, -4, -1},
+  {-1, -4, 12, 53, 57, 16, -4, -1},
+  {0, -4, 10, 52, 58, 17, -4, -1},
+  {-1, -4, 9, 51, 59, 19, -4, -1},
+  {-1, -4, 8, 50, 60, 20, -4, -1},
+  {0, -4, 7, 49, 60, 21, -3, -2},
+  {0, -4, 6, 47, 61, 23, -3, -2},
+  {0, -4, 5, 46, 62, 24, -3, -2},
+  {0, -4, 5, 45, 62, 25, -3, -2},
+  {0, -4, 4, 43, 63, 27, -3, -2},
+  {0, -4, 3, 42, 63, 28, -2, -2},
+  {0, -3, 2, 41, 63, 29, -2, -2},
+  {0, -3, 2, 39, 63, 31, -2, -2},
+  {0, -3, 1, 38, 64, 32, -1, -3},
+  {0, -3, 1, 36, 64, 34, -1, -3}
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+  {-1, -8, 33, 80, 33, -8, -1, 0},
+  {-1, -8, 30, 80, 35, -8, -1, 1},
+  {-1, -8, 28, 80, 37, -7, -2, 1},
+  {0, -8, 26, 79, 39, -7, -2, 1},
+  {0, -8, 24, 79, 41, -7, -2, 1},
+  {0, -8, 22, 78, 43, -6, -2, 1},
+  {0, -8, 20, 78, 45, -5, -3, 1},
+  {0, -8, 18, 77, 48, -5, -3, 1},
+  {0, -8, 16, 76, 50, -4, -3, 1},
+  {0, -8, 15, 75, 52, -3, -4, 1},
+  {0, -7, 13, 74, 54, -3, -4, 1},
+  {0, -7, 11, 73, 56, -2, -4, 1},
+  {0, -7, 10, 71, 58, -1, -4, 1},
+  {1, -7, 8, 70, 60, 0, -5, 1},
+  {1, -6, 6, 68, 62, 1, -5, 1},
+  {1, -6, 5, 67, 63, 2, -5, 1},
+  {1, -6, 4, 65, 65, 4, -6, 1},
+  {1, -5, 2, 63, 67, 5, -6, 1},
+  {1, -5, 1, 62, 68, 6, -6, 1},
+  {1, -5, 0, 60, 70, 8, -7, 1},
+  {1, -4, -1, 58, 71, 10, -7, 0},
+  {1, -4, -2, 56, 73, 11, -7, 0},
+  {1, -4, -3, 54, 74, 13, -7, 0},
+  {1, -4, -3, 52, 75, 15, -8, 0},
+  {1, -3, -4, 50, 76, 16, -8, 0},
+  {1, -3, -5, 48, 77, 18, -8, 0},
+  {1, -3, -5, 45, 78, 20, -8, 0},
+  {1, -2, -6, 43, 78, 22, -8, 0},
+  {1, -2, -7, 41, 79, 24, -8, 0},
+  {1, -2, -7, 39, 79, 26, -8, 0},
+  {1, -2, -7, 37, 80, 28, -8, -1},
+  {1, -1, -8, 35, 80, 30, -8, -1},
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+const interp_kernel vp9_filteredinterp_filters750[(1 << SUBPEL_BITS)] = { + {2, -11, 25, 96, 25, -11, 2, 0}, + {2, -11, 22, 96, 28, -11, 2, 0}, + {2, -10, 19, 95, 31, -11, 2, 0}, + {2, -10, 17, 95, 34, -12, 2, 0}, + {2, -9, 14, 94, 37, -12, 2, 0}, + {2, -8, 12, 93, 40, -12, 1, 0}, + {2, -8, 9, 92, 43, -12, 1, 1}, + {2, -7, 7, 91, 46, -12, 1, 0}, + {2, -7, 5, 90, 49, -12, 1, 0}, + {2, -6, 3, 88, 52, -12, 0, 1}, + {2, -5, 1, 86, 55, -12, 0, 1}, + {2, -5, -1, 84, 58, -11, 0, 1}, + {2, -4, -2, 82, 61, -11, -1, 1}, + {2, -4, -4, 80, 64, -10, -1, 1}, + {1, -3, -5, 77, 67, -9, -1, 1}, + {1, -3, -6, 75, 70, -8, -2, 1}, + {1, -2, -7, 72, 72, -7, -2, 1}, + {1, -2, -8, 70, 75, -6, -3, 1}, + {1, -1, -9, 67, 77, -5, -3, 1}, + {1, -1, -10, 64, 80, -4, -4, 2}, + {1, -1, -11, 61, 82, -2, -4, 2}, + {1, 0, -11, 58, 84, -1, -5, 2}, + {1, 0, -12, 55, 86, 1, -5, 2}, + {1, 0, -12, 52, 88, 3, -6, 2}, + {0, 1, -12, 49, 90, 5, -7, 2}, + {0, 1, -12, 46, 91, 7, -7, 2}, + {1, 1, -12, 43, 92, 9, -8, 2}, + {0, 1, -12, 40, 93, 12, -8, 2}, + {0, 2, -12, 37, 94, 14, -9, 2}, + {0, 2, -12, 34, 95, 17, -10, 2}, + {0, 2, -11, 31, 95, 19, -10, 2}, + {0, 2, -11, 28, 96, 22, -11, 2} +}; + +// Filters for interpolation (0.875-band) - note this also filters integer pels. +const interp_kernel vp9_filteredinterp_filters875[(1 << SUBPEL_BITS)] = { + {3, -8, 13, 112, 13, -8, 3, 0}, + {3, -7, 10, 112, 17, -9, 3, -1}, + {2, -6, 7, 111, 21, -9, 3, -1}, + {2, -5, 4, 111, 24, -10, 3, -1}, + {2, -4, 1, 110, 28, -11, 3, -1}, + {1, -3, -1, 108, 32, -12, 4, -1}, + {1, -2, -3, 106, 36, -13, 4, -1}, + {1, -1, -6, 105, 40, -14, 4, -1}, + {1, -1, -7, 102, 44, -14, 4, -1}, + {1, 0, -9, 100, 48, -15, 4, -1}, + {1, 1, -11, 97, 53, -16, 4, -1}, + {0, 1, -12, 95, 57, -16, 4, -1}, + {0, 2, -13, 91, 61, -16, 4, -1}, + {0, 2, -14, 88, 65, -16, 4, -1}, + {0, 3, -15, 84, 69, -17, 4, 0}, + {0, 3, -16, 81, 73, -16, 3, 0}, + {0, 3, -16, 77, 77, -16, 3, 0}, + {0, 3, -16, 73, 81, -16, 3, 0}, + {0, 4, -17, 69, 84, -15, 3, 0}, + {-1, 4, -16, 65, 88, -14, 2, 0}, + {-1, 4, -16, 61, 91, -13, 2, 0}, + {-1, 4, -16, 57, 95, -12, 1, 0}, + {-1, 4, -16, 53, 97, -11, 1, 1}, + {-1, 4, -15, 48, 100, -9, 0, 1}, + {-1, 4, -14, 44, 102, -7, -1, 1}, + {-1, 4, -14, 40, 105, -6, -1, 1}, + {-1, 4, -13, 36, 106, -3, -2, 1}, + {-1, 4, -12, 32, 108, -1, -3, 1}, + {-1, 3, -11, 28, 110, 1, -4, 2}, + {-1, 3, -10, 24, 111, 4, -5, 2}, + {-1, 3, -9, 21, 111, 7, -6, 2}, + {-1, 3, -9, 17, 112, 10, -7, 3} +}; + +// Filters for interpolation (full-band) - no filtering for integer pixels +const interp_kernel vp9_filteredinterp_filters1000[(1 << SUBPEL_BITS)] = { + {0, 0, 0, 128, 0, 0, 0, 0}, + {0, 1, -3, 128, 3, -1, 0, 0}, + {-1, 2, -6, 127, 7, -2, 1, 0}, + {-1, 3, -9, 126, 12, -4, 1, 0}, + {-1, 4, -12, 125, 16, -5, 1, 0}, + {-1, 4, -14, 123, 20, -6, 2, 0}, + {-1, 5, -15, 120, 25, -8, 2, 0}, + {-1, 5, -17, 118, 30, -9, 3, -1}, + {-1, 6, -18, 114, 35, -10, 3, -1}, + {-1, 6, -19, 111, 41, -12, 3, -1}, + {-1, 6, -20, 107, 46, -13, 4, -1}, + {-1, 6, -21, 103, 52, -14, 4, -1}, + {-1, 6, -21, 99, 57, -16, 5, -1}, + {-1, 6, -21, 94, 63, -17, 5, -1}, + {-1, 6, -20, 89, 68, -18, 5, -1}, + {-1, 6, -20, 84, 73, -19, 6, -1}, + {-1, 6, -20, 79, 79, -20, 6, -1}, + {-1, 6, -19, 73, 84, -20, 6, -1}, + {-1, 5, -18, 68, 89, -20, 6, -1}, + {-1, 5, -17, 63, 94, -21, 6, -1}, + {-1, 5, -16, 57, 99, -21, 6, -1}, + {-1, 4, -14, 52, 103, -21, 6, -1}, + {-1, 4, -13, 46, 107, -20, 6, -1}, + {-1, 3, -12, 41, 111, -19, 6, -1}, + {-1, 3, -10, 35, 114, -18, 6, -1}, + {-1, 3, -9, 30, 118, -17, 5, -1}, + {0, 2, -8, 25, 120, 
-15, 5, -1}, + {0, 2, -6, 20, 123, -14, 4, -1}, + {0, 1, -5, 16, 125, -12, 4, -1}, + {0, 1, -4, 12, 126, -9, 3, -1}, + {0, 1, -2, 7, 127, -6, 2, -1}, + {0, 0, -1, 3, 128, -3, 1, 0} +}; + +// Filters for factor of 2 downsampling. +static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1}; +static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3}; + +static const interp_kernel *choose_interp_filter(int inlength, int outlength) { + int outlength16 = outlength * 16; + if (outlength16 >= inlength * 16) + return vp9_filteredinterp_filters1000; + else if (outlength16 >= inlength * 13) + return vp9_filteredinterp_filters875; + else if (outlength16 >= inlength * 11) + return vp9_filteredinterp_filters750; + else if (outlength16 >= inlength * 9) + return vp9_filteredinterp_filters625; + else + return vp9_filteredinterp_filters500; +} + +static void interpolate(const uint8_t *const input, int inlength, + uint8_t *output, int outlength) { + const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) / + outlength; + const int64_t offset = inlength > outlength ? + (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength : + -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int64_t y; + + const interp_kernel *interp_filters = + choose_interp_filter(inlength, outlength); + + x = 0; + y = offset; + while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = outlength - 1; + y = delta * x + offset; + while ((y >> INTERP_PRECISION_BITS) + + (int64_t)(INTERP_TAPS / 2) >= inlength) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) { + const int pk = int_pel - INTERP_TAPS / 2 + 1 + k; + sum += filter[k] * input[(pk < 0 ? 0 : + (pk >= inlength ? inlength - 1 : pk))]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset; x < x1; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ? + 0 : + int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. + for (; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = interp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= + inlength ? 
inlength - 1 : + int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. + static const int16_t *filter = vp9_down2_symeven_half_filter; + const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = vp9_down2_symodd_half_filter; + const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[(i + j >= length ? 
length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static int get_down2_length(int length, int steps) { + int s; + for (s = 0; s < steps; ++s) + length = (length + 1) >> 1; + return length; +} + +int get_down2_steps(int in_length, int out_length) { + int steps = 0; + int proj_in_length; + while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { + ++steps; + in_length = proj_in_length; + } + return steps; +} + +static void resize_multistep(const uint8_t *const input, + int length, + uint8_t *output, + int olength, + uint8_t *buf) { + int steps; + if (length == olength) { + memcpy(output, input, sizeof(uint8_t) * length); + return; + } + steps = get_down2_steps(length, olength); + + if (steps > 0) { + int s; + uint8_t *out = NULL; + uint8_t *tmpbuf = NULL; + uint8_t *otmp, *otmp2; + int filteredlength = length; + if (!tmpbuf) { + tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length); + otmp = tmpbuf; + } else { + otmp = buf; + } + otmp2 = otmp + get_down2_length(length, 1); + for (s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint8_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + if (tmpbuf) + free(tmpbuf); + } else { + interpolate(input, length, output, olength); + } +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void vp9_resize_plane(const uint8_t *const input, + int height, + int width, + int in_stride, + uint8_t *output, + int height2, + int width2, + int out_stride) { + int i; + uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height); + uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * + (width < height ? 
height : width)); + uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2)); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, + intbuf + width2 * i, width2, tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf + height); + } + free(intbuf); + free(tmpbuf); + free(arrbuf); +} + +void vp9_resize_frame420(const uint8_t *const y, + int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, + int height, int width, + uint8_t *oy, int oy_stride, + uint8_t *ou, uint8_t *ov, int ouv_stride, + int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, + oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height / 2, width / 2, uv_stride, + ou, oheight / 2, owidth / 2, ouv_stride); + vp9_resize_plane(v, height / 2, width / 2, uv_stride, + ov, oheight / 2, owidth / 2, ouv_stride); +} + +void vp9_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, + int height, int width, + uint8_t *oy, int oy_stride, + uint8_t *ou, uint8_t *ov, int ouv_stride, + int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, + oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height, width / 2, uv_stride, + ou, oheight, owidth / 2, ouv_stride); + vp9_resize_plane(v, height, width / 2, uv_stride, + ov, oheight, owidth / 2, ouv_stride); +} + +void vp9_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, + int height, int width, + uint8_t *oy, int oy_stride, + uint8_t *ou, uint8_t *ov, int ouv_stride, + int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, + oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height, width, uv_stride, + ou, oheight, owidth, ouv_stride); + vp9_resize_plane(v, height, width, uv_stride, + ov, oheight, owidth, ouv_stride); +} diff --git a/media/libvpx/vp9/encoder/vp9_resize.h b/media/libvpx/vp9/encoder/vp9_resize.h new file mode 100644 index 000000000000..1818cd47efbc --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_resize.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_ENCODER_VP9_RESIZE_H_ +#define VP9_ENCODER_VP9_RESIZE_H_ + +#include <stdio.h> +#include "vpx/vpx_integer.h" + +void vp9_resize_plane(const uint8_t *const input, + int height, + int width, + int in_stride, + uint8_t *output, + int height2, + int width2, + int out_stride); +void vp9_resize_frame420(const uint8_t *const y, + int y_stride, + const uint8_t *const u, + const uint8_t *const v, + int uv_stride, + int height, + int width, + uint8_t *oy, + int oy_stride, + uint8_t *ou, + uint8_t *ov, + int ouv_stride, + int oheight, + int owidth); +void vp9_resize_frame422(const uint8_t *const y, + int y_stride, + const uint8_t *const u, + const uint8_t *const v, + int uv_stride, + int height, + int width, + uint8_t *oy, + int oy_stride, + uint8_t *ou, + uint8_t *ov, + int ouv_stride, + int oheight, + int owidth); +void vp9_resize_frame444(const uint8_t *const y, + int y_stride, + const uint8_t *const u, + const uint8_t *const v, + int uv_stride, + int height, + int width, + uint8_t *oy, + int oy_stride, + uint8_t *ou, + uint8_t *ov, + int ouv_stride, + int oheight, + int owidth); + +#endif // VP9_ENCODER_VP9_RESIZE_H_ diff --git a/media/libvpx/vp9/encoder/vp9_sad.c b/media/libvpx/vp9/encoder/vp9_sad.c new file mode 100644 index 000000000000..d06263676fa4 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_sad.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_variance.h" + +static INLINE unsigned int sad(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + + return sad; +} + +#define sadMxN(m, n) \ +unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ +} \ +unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ +} + +#define sadMxNxK(m, n, k) \ +void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + unsigned int *sads) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \ +} + +#define sadMxNx4D(m, n) \ +void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const refs[], int ref_stride, \ + unsigned int *sads) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \ +} + +// 64x64 +sadMxN(64, 64) +sadMxNxK(64, 64, 3) +sadMxNxK(64, 64, 8) +sadMxNx4D(64, 64) + +// 64x32 +sadMxN(64, 32) +sadMxNx4D(64, 32) + +// 32x64 +sadMxN(32, 64) +sadMxNx4D(32, 64) + +// 32x32 +sadMxN(32, 32) +sadMxNxK(32, 32, 3) +sadMxNxK(32, 32, 8)
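Note on the macro machinery in the new vp9_sad.c above: each sadMxN invocation pastes the block dimensions into a function name and body. As a sketch of the preprocessor output, sadMxN(4, 4) (invoked a little further down) expands to roughly:

    unsigned int vp9_sad4x4_c(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride) {
      return sad(src, src_stride, ref, ref_stride, 4, 4);
    }
    unsigned int vp9_sad4x4_avg_c(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred) {
      uint8_t comp_pred[4 * 4];  /* on-stack averaged prediction block */
      vp9_comp_avg_pred(comp_pred, second_pred, 4, 4, ref, ref_stride);
      /* comp_pred is a contiguous 4-wide block, so its stride is 4 */
      return sad(src, src_stride, comp_pred, 4, 4, 4);
    }

The remaining invocations for the 32x32 group and the smaller block sizes continue below.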
+sadMxNx4D(32, 32) + +// 32x16 +sadMxN(32, 16) +sadMxNx4D(32, 16) + +// 16x32 +sadMxN(16, 32) +sadMxNx4D(16, 32) + +// 16x16 +sadMxN(16, 16) +sadMxNxK(16, 16, 3) +sadMxNxK(16, 16, 8) +sadMxNx4D(16, 16) + +// 16x8 +sadMxN(16, 8) +sadMxNxK(16, 8, 3) +sadMxNxK(16, 8, 8) +sadMxNx4D(16, 8) + +// 8x16 +sadMxN(8, 16) +sadMxNxK(8, 16, 3) +sadMxNxK(8, 16, 8) +sadMxNx4D(8, 16) + +// 8x8 +sadMxN(8, 8) +sadMxNxK(8, 8, 3) +sadMxNxK(8, 8, 8) +sadMxNx4D(8, 8) + +// 8x4 +sadMxN(8, 4) +sadMxNxK(8, 4, 8) +sadMxNx4D(8, 4) + +// 4x8 +sadMxN(4, 8) +sadMxNxK(4, 8, 8) +sadMxNx4D(4, 8) + +// 4x4 +sadMxN(4, 4) +sadMxNxK(4, 4, 3) +sadMxNxK(4, 4, 8) +sadMxNx4D(4, 4) diff --git a/media/libvpx/vp9/encoder/vp9_sad_c.c b/media/libvpx/vp9/encoder/vp9_sad_c.c deleted file mode 100644 index 42ddb21a51b1..000000000000 --- a/media/libvpx/vp9/encoder/vp9_sad_c.c +++ /dev/null @@ -1,615 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include <stdlib.h> -#include "vp9/common/vp9_sadmxn.h" -#include "vp9/encoder/vp9_variance.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" -#include "./vp9_rtcd.h" - -#define sad_mxn_func(m, n) \ -unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - unsigned int max_sad) { \ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ -} \ -unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred, \ - unsigned int max_sad) { \ - uint8_t comp_pred[m * n]; \ - comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ - return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \ -} - -sad_mxn_func(64, 64) -sad_mxn_func(64, 32) -sad_mxn_func(32, 64) -sad_mxn_func(32, 32) -sad_mxn_func(32, 16) -sad_mxn_func(16, 32) -sad_mxn_func(16, 16) -sad_mxn_func(16, 8) -sad_mxn_func(8, 16) -sad_mxn_func(8, 8) -sad_mxn_func(8, 4) -sad_mxn_func(4, 8) -sad_mxn_func(4, 4) - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] =
vp9_sad32x16(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad64x64x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride, - 0x7fffffff); -} - -void vp9_sad32x32x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); -} - -void vp9_sad64x64x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad32x32x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad16x16x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int 
ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); -} - -void vp9_sad16x16x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad16x8x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); -} - -void vp9_sad16x8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad8x8x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); -} - -void vp9_sad8x8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - 
sad_array[6] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad8x16x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); -} - -void vp9_sad8x16x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad4x4x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); -} - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[1], ref_stride, 
0x7fffffff); - sad_array[2] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad8x8x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 
7, ref_stride, - 0x7fffffff); -} - -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); -} - -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, - unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); -} diff --git a/media/libvpx/vp9/encoder/vp9_segmentation.c b/media/libvpx/vp9/encoder/vp9_segmentation.c index 24f011f83093..d5676c3d1b94 100644 --- a/media/libvpx/vp9/encoder/vp9_segmentation.c +++ b/media/libvpx/vp9/encoder/vp9_segmentation.c @@ -10,46 +10,30 @@ #include <limits.h> + #include "vpx_mem/vpx_mem.h" -#include "vp9/encoder/vp9_segmentation.h" + #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_tile_common.h" -void vp9_enable_segmentation(VP9_PTR ptr) { - VP9_COMP *cpi = (VP9_COMP *)ptr; - struct segmentation *const seg = &cpi->common.seg; +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_segmentation.h" +void vp9_enable_segmentation(struct segmentation *seg) { seg->enabled = 1; seg->update_map = 1; seg->update_data = 1; } -void vp9_disable_segmentation(VP9_PTR ptr) { - VP9_COMP *cpi = (VP9_COMP *)ptr; - struct segmentation *const seg = &cpi->common.seg; +void vp9_disable_segmentation(struct segmentation *seg) { seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; } -void vp9_set_segmentation_map(VP9_PTR ptr, - unsigned char *segmentation_map) { - VP9_COMP *cpi = (VP9_COMP *)ptr; - struct segmentation *const seg = &cpi->common.seg; - - // Copy in the new segmentation map - vpx_memcpy(cpi->segmentation_map, segmentation_map, - (cpi->common.mi_rows * cpi->common.mi_cols)); - - // Signal that the map should be updated.
- seg->update_map = 1; - seg->update_data = 1; -} - -void vp9_set_segment_data(VP9_PTR ptr, +void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, unsigned char abs_delta) { - VP9_COMP *cpi = (VP9_COMP *)ptr; - struct segmentation *const seg = &cpi->common.seg; - seg->abs_delta = abs_delta; vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data)); @@ -58,6 +42,15 @@ void vp9_set_segment_data(VP9_PTR ptr, // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0, // sizeof(cpi->mb.e_mbd.segment_feature_mask)); } +void vp9_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_mask[segment_id] &= ~(1 << feature_id); +} + +void vp9_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + seg->feature_data[segment_id][feature_id] = 0; +} // Based on set of segment counts calculate a probability tree static void calc_segtree_probs(int *segcounts, vp9_prob *segment_tree_probs) { @@ -117,21 +110,19 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, +static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int bw, int bh, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; int segment_id; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - xd->mi_8x8 = mi_8x8; - segment_id = xd->mi_8x8[0]->mbmi.segment_id; + xd->mi = mi; + segment_id = xd->mi[0]->mbmi.segment_id; set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); @@ -140,7 +131,7 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, // Temporal prediction not allowed on key frames if (cm->frame_type != KEY_FRAME) { - const BLOCK_SIZE bsize = mi_8x8[0]->mbmi.sb_type; + const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type; // Test to see if the segment id matches the predicted value. 
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col); @@ -149,46 +140,45 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, // Store the prediction status for this mb and update counts // as appropriate - vp9_set_pred_flag_seg_id(xd, pred_flag); + xd->mi[0]->mbmi.seg_id_predicted = pred_flag; temporal_predictor_count[pred_context][pred_flag]++; + // Update the "unpredicted" segment count if (!pred_flag) - // Update the "unpredicted" segment count t_unpred_seg_counts[segment_id]++; } } -static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi_8x8, +static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int mi_row, int mi_col, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - const int mis = cm->mode_info_stride; + const int mis = cm->mi_stride; int bw, bh; const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bw = num_8x8_blocks_wide_lookup[mi_8x8[0]->mbmi.sb_type]; - bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type]; + bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type]; + bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts, + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, tile, mi_8x8 + hbs, + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); } else { @@ -201,7 +191,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc], + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -209,8 +199,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, } } -void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) { struct segmentation *seg = &cm->seg; int no_pred_cost; @@ -226,9 +215,6 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; - const int mis = cm->mode_info_stride; - MODE_INFO **mi_ptr, **mi; - // Set default state for the segment tree probabilities and the // temporal coding probabilities vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); @@ -238,15 +224,16 @@ void 
vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // predicts this one for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) { TileInfo tile; - + MODE_INFO **mi_ptr; vp9_tile_init(&tile, cm, 0, tile_col); + mi_ptr = cm->mi_grid_visible + tile.mi_col_start; for (mi_row = 0; mi_row < cm->mi_rows; - mi_row += 8, mi_ptr += 8 * mis) { - mi = mi_ptr; + mi_row += 8, mi_ptr += 8 * cm->mi_stride) { + MODE_INFO **mi = mi_ptr; for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64); } @@ -287,3 +274,12 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { vpx_memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree)); } } + +void vp9_reset_segment_features(struct segmentation *seg) { + // Set up default state for MB feature flags + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); + vp9_clearall_segfeatures(seg); +} diff --git a/media/libvpx/vp9/encoder/vp9_segmentation.h b/media/libvpx/vp9/encoder/vp9_segmentation.h index 2183771c4597..8c6944ad13ce 100644 --- a/media/libvpx/vp9/encoder/vp9_segmentation.h +++ b/media/libvpx/vp9/encoder/vp9_segmentation.h @@ -13,14 +13,21 @@ #define VP9_ENCODER_VP9_SEGMENTATION_H_ #include "vp9/common/vp9_blockd.h" -#include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_encoder.h" -void vp9_enable_segmentation(VP9_PTR ptr); -void vp9_disable_segmentation(VP9_PTR ptr); +#ifdef __cplusplus +extern "C" { +#endif -// Valid values for a segment are 0 to 3 -// Segmentation map is arrange as [Rows][Columns] -void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map); +void vp9_enable_segmentation(struct segmentation *seg); +void vp9_disable_segmentation(struct segmentation *seg); + +void vp9_disable_segfeature(struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id); +void vp9_clear_segdata(struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id); // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. @@ -32,9 +39,15 @@ void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map); // // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use // the absolute values given). -void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data, +void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, unsigned char abs_delta); -void vp9_choose_segmap_coding_method(VP9_COMP *cpi); +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd); + +void vp9_reset_segment_features(struct segmentation *seg); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/media/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/vp9/encoder/vp9_speed_features.c new file mode 100644 index 000000000000..dbf4ae96ad48 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_speed_features.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <limits.h> + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_speed_features.h" + +enum { + INTRA_ALL = (1 << DC_PRED) | + (1 << V_PRED) | (1 << H_PRED) | + (1 << D45_PRED) | (1 << D135_PRED) | + (1 << D117_PRED) | (1 << D153_PRED) | + (1 << D207_PRED) | (1 << D63_PRED) | + (1 << TM_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | + (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV), + INTER_NEAREST = (1 << NEARESTMV), + INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | + (1 << THR_COMP_LA) | + (1 << THR_ALTR) | + (1 << THR_GOLD) | + (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | + (1 << THR_COMP_LA) | + (1 << THR_ALTR) | + (1 << THR_GOLD) +}; + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const VP9_COMP *cpi) { + return frame_is_intra_only(&cpi->common) || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) || + vp9_is_upper_layer_key_frame(cpi); +} + + +static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, + SPEED_FEATURES *sf, int speed) { + const int boosted = frame_is_boosted(cpi); + + sf->adaptive_rd_thresh = 1; + sf->allow_skip_recode = 1; + + if (speed >= 1) { + sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->less_rectangular_check = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; + else + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; + sf->mv.auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 2; + sf->mv.subpel_iters_per_step = 1; + sf->mode_skip_start = 10; + sf->adaptive_pred_interp_filter = 1; + + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + + sf->tx_size_search_breakout = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 23); + else + sf->partition_search_breakout_dist_thr = (1 << 21); + sf->partition_search_breakout_rate_thr = 500; + } + + if (speed >= 2) { + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD + : USE_LARGESTALL; + + if (MIN(cm->width, cm->height) >= 720) { + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; + sf->last_partitioning_redo_frequency = 3; + sf->disable_split_mask = cm->show_frame ?
DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; + sf->adaptive_pred_interp_filter = 0; + } else { + sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + sf->last_partitioning_redo_frequency = 2; + sf->lf_motion_threshold = NO_MOTION_THRESHOLD; + } + + sf->reference_masking = 1; + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_INTRA_LOWVAR; + sf->disable_filter_search_var_thresh = 100; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; + sf->adjust_partitioning_from_last_frame = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 24); + else + sf->partition_search_breakout_dist_thr = (1 << 22); + sf->partition_search_breakout_rate_thr = 700; + } + + if (speed >= 3) { + sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD + : USE_LARGESTALL; + if (MIN(cm->width, cm->height) >= 720) { + sf->disable_split_mask = DISABLE_ALL_SPLIT; + } else { + sf->max_intra_bsize = BLOCK_32X32; + sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + } + sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 1; + sf->cb_partition_search = !boosted; + sf->cb_pred_filter_search = 1; + sf->alt_ref_search_fp = 1; + sf->motion_field_mode_search = !boosted; + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; + sf->last_partitioning_redo_frequency = 2; + sf->recode_loop = ALLOW_RECODE_KFMAXBW; + sf->adaptive_rd_thresh = 3; + sf->mode_skip_start = 6; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; + sf->adaptive_interp_filter_search = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 25); + else + sf->partition_search_breakout_dist_thr = (1 << 23); + sf->partition_search_breakout_rate_thr = 1000; + } + + if (speed >= 4) { + sf->use_square_partition_only = 1; + sf->tx_size_search_method = USE_LARGESTALL; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->adaptive_rd_thresh = 4; + sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH | + FLAG_EARLY_TERMINATE; + sf->disable_filter_search_var_thresh = 200; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; + sf->use_lp32x32fdct = 1; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->use_fast_coef_costing = 1; + + if (MIN(cm->width, cm->height) >= 720) + sf->partition_search_breakout_dist_thr = (1 << 26); + else + sf->partition_search_breakout_dist_thr = (1 << 24); + sf->partition_search_breakout_rate_thr = 1500; + } + + if (speed >= 5) { + int i; + + sf->partition_search_type = FIXED_PARTITION; + sf->optimize_coefficients = 0; + sf->mv.search_method = HEX; + sf->disable_filter_search_var_thresh = 500; + for (i = 0; i < TX_SIZES; ++i) { + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = INTRA_DC; + } + cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; + } + if (speed >= 6) { + sf->mv.reduce_first_step_size = 1; + } +} + +static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, + int speed, vp9e_tune_content content) { + VP9_COMMON *const cm = &cpi->common; + const int is_keyframe = cm->frame_type == KEY_FRAME; + const int frames_since_key = is_keyframe ? 
0 : cpi->rc.frames_since_key; + sf->static_segmentation = 0; + sf->adaptive_rd_thresh = 1; + sf->use_fast_coef_costing = 1; + + if (speed >= 1) { + sf->use_square_partition_only = !frame_is_intra_only(cm); + sf->less_rectangular_check = 1; + sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD + : USE_LARGESTALL; + + if (MIN(cm->width, cm->height) >= 720) + sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; + else + sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + + sf->use_rd_breakout = 1; + + sf->adaptive_motion_search = 1; + sf->adaptive_pred_interp_filter = 1; + sf->mv.auto_mv_step_size = 1; + sf->adaptive_rd_thresh = 2; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + + if (speed >= 2) { + if (MIN(cm->width, cm->height) >= 720) + sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT + : DISABLE_ALL_INTER_SPLIT; + else + sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; + + sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | + FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_INTRA_LOWVAR; + sf->adaptive_pred_interp_filter = 2; + sf->reference_masking = 1; + sf->disable_filter_search_var_thresh = 50; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION; + sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; + sf->adjust_partitioning_from_last_frame = 1; + sf->last_partitioning_redo_frequency = 3; + sf->use_lp32x32fdct = 1; + sf->mode_skip_start = 11; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + } + + if (speed >= 3) { + sf->use_square_partition_only = 1; + sf->disable_filter_search_var_thresh = 100; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; + sf->constrain_copy_partition = 1; + sf->use_uv_intra_rd_estimate = 1; + sf->skip_encode_sb = 1; + sf->mv.subpel_iters_per_step = 1; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->adaptive_rd_thresh = 4; + sf->mode_skip_start = 6; + sf->allow_skip_recode = 0; + sf->optimize_coefficients = 0; + sf->disable_split_mask = DISABLE_ALL_SPLIT; + sf->lpf_pick = LPF_PICK_FROM_Q; + } + + if (speed >= 4) { + int i; + sf->last_partitioning_redo_frequency = 4; + sf->adaptive_rd_thresh = 5; + sf->use_fast_coef_costing = 0; + sf->auto_min_max_partition_size = STRICT_NEIGHBORING_MIN_MAX; + sf->adjust_partitioning_from_last_frame = + cm->last_frame_type != cm->frame_type || (0 == + (frames_since_key + 1) % sf->last_partitioning_redo_frequency); + sf->mv.subpel_force_stop = 1; + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[i] = INTRA_DC; + } + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->frame_parameter_update = 0; + sf->mv.search_method = FAST_HEX; + sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEAR_NEW; + sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST; + sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST; + sf->max_intra_bsize = BLOCK_32X32; + sf->allow_skip_recode = 1; + } + + if (speed >= 5) { + sf->use_quant_fp = !is_keyframe; + sf->auto_min_max_partition_size = is_keyframe ? 
RELAXED_NEIGHBORING_MIN_MAX + : STRICT_NEIGHBORING_MIN_MAX; + sf->max_partition_size = BLOCK_32X32; + sf->min_partition_size = BLOCK_8X8; + sf->partition_check = + (frames_since_key % sf->last_partitioning_redo_frequency == 1); + sf->force_frame_boost = is_keyframe || + (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); + sf->max_delta_qindex = is_keyframe ? 20 : 15; + sf->partition_search_type = REFERENCE_PARTITION; + sf->use_nonrd_pick_mode = 1; + sf->allow_skip_recode = 0; + } + + if (speed >= 6) { + if (content == VP9E_CONTENT_SCREEN) { + int i; + // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used + for (i = 0; i < BLOCK_SIZES; ++i) + sf->inter_mode_mask[i] = INTER_ALL; + } + + // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION. + sf->partition_search_type = SOURCE_VAR_BASED_PARTITION; + sf->search_type_check_frequency = 50; + + sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8; + + // This feature is only enabled when partition search is disabled. + sf->reuse_inter_pred_sby = 1; + + // Increase mode checking threshold for NEWMV. + sf->elevate_newmv_thresh = 2000; + + sf->mv.reduce_first_step_size = 1; + } + + if (speed >= 7) { + sf->mv.search_method = FAST_DIAMOND; + sf->mv.fullpel_search_step_param = 10; + sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ? + 800 : 300; + sf->elevate_newmv_thresh = 2500; + } + + if (speed >= 12) { + sf->elevate_newmv_thresh = 4000; + sf->mv.subpel_force_stop = 2; + } + + if (speed >= 13) { + int i; + sf->max_intra_bsize = BLOCK_32X32; + for (i = 0; i < BLOCK_SIZES; ++i) + sf->inter_mode_mask[i] = INTER_NEAREST; + } +} + +void vp9_set_speed_features(VP9_COMP *cpi) { + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + // best quality defaults + sf->frame_parameter_update = 1; + sf->mv.search_method = NSTEP; + sf->recode_loop = ALLOW_RECODE; + sf->mv.subpel_search_method = SUBPEL_TREE; + sf->mv.subpel_iters_per_step = 2; + sf->mv.subpel_force_stop = 0; + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); + sf->mv.reduce_first_step_size = 0; + sf->mv.auto_mv_step_size = 0; + sf->mv.fullpel_search_step_param = 6; + sf->comp_inter_joint_search_thresh = BLOCK_4X4; + sf->adaptive_rd_thresh = 0; + sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF; + sf->tx_size_search_method = USE_FULL_RD; + sf->use_lp32x32fdct = 0; + sf->adaptive_motion_search = 0; + sf->adaptive_pred_interp_filter = 0; + sf->adaptive_mode_search = 0; + sf->cb_pred_filter_search = 0; + sf->cb_partition_search = 0; + sf->motion_field_mode_search = 0; + sf->alt_ref_search_fp = 0; + sf->use_quant_fp = 0; + sf->reference_masking = 0; + sf->partition_search_type = SEARCH_PARTITION; + sf->less_rectangular_check = 0; + sf->use_square_partition_only = 0; + sf->auto_min_max_partition_size = NOT_IN_USE; + sf->max_partition_size = BLOCK_64X64; + sf->min_partition_size = BLOCK_4X4; + sf->adjust_partitioning_from_last_frame = 0; + sf->last_partitioning_redo_frequency = 4; + sf->constrain_copy_partition = 0; + sf->disable_split_mask = 0; + sf->mode_search_skip_flags = 0; + sf->force_frame_boost = 0; + sf->max_delta_qindex = 0; + sf->disable_filter_search_var_thresh = 0; + sf->adaptive_interp_filter_search = 0; + + for (i = 0; i < TX_SIZES; i++) { + sf->intra_y_mode_mask[i] = INTRA_ALL; + sf->intra_uv_mode_mask[i] = INTRA_ALL; + } + sf->use_rd_breakout = 0; 
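A note on the intra_y_mode_mask/intra_uv_mode_mask defaults just set: each entry is a bitmask over intra prediction modes, indexed by transform size. The mode-search loop elsewhere in the encoder consumes these masks with a plain bit test; a minimal sketch, not the literal RD-loop code (the bsize-to-TX_SIZE lookup name here is an assumption):

    /* Sketch: skip any intra mode the current speed setting has masked
       off for this transform size. With INTRA_DC_H_V, for example, only
       DC_PRED, V_PRED and H_PRED survive the test. */
    if (!(sf->intra_y_mode_mask[max_tx_size] & (1 << this_mode)))
      continue;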
+ sf->skip_encode_sb = 0; + sf->use_uv_intra_rd_estimate = 0; + sf->allow_skip_recode = 0; + sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + sf->use_fast_coef_updates = TWO_LOOP; + sf->use_fast_coef_costing = 0; + sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set + sf->use_nonrd_pick_mode = 0; + for (i = 0; i < BLOCK_SIZES; ++i) + sf->inter_mode_mask[i] = INTER_ALL; + sf->max_intra_bsize = BLOCK_64X64; + sf->reuse_inter_pred_sby = 0; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + sf->always_this_block_size = BLOCK_16X16; + sf->search_type_check_frequency = 50; + sf->encode_breakout_thresh = 0; + sf->elevate_newmv_thresh = 0; + // Recode loop tolerance %. + sf->recode_tolerance = 25; + sf->default_interp_filter = SWITCHABLE; + sf->tx_size_search_breakout = 0; + sf->partition_search_breakout_dist_thr = 0; + sf->partition_search_breakout_rate_thr = 0; + + if (oxcf->mode == REALTIME) + set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); + else if (oxcf->mode == GOOD) + set_good_speed_feature(cpi, cm, sf, oxcf->speed); + + cpi->full_search_sad = vp9_full_search_sad; + cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search + : vp9_diamond_search_sad; + cpi->refining_search_sad = vp9_refining_search_sad; + + + // Slow quant, dct and trellis not worthwhile for first pass + // so make sure they are always turned off. + if (oxcf->pass == 1) + sf->optimize_coefficients = 0; + + // No recode for 1 pass. + if (oxcf->pass == 0) { + sf->recode_loop = DISALLOW_RECODE; + sf->optimize_coefficients = 0; + } + + if (sf->mv.subpel_search_method == SUBPEL_TREE) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; + } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) { + cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned; + } + + cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; + + if (sf->disable_split_mask == DISABLE_ALL_SPLIT) + sf->adaptive_pred_interp_filter = 0; + + if (!cpi->oxcf.frame_periodic_boost) { + sf->max_delta_qindex = 0; + } + + if (cpi->encode_breakout && oxcf->mode == REALTIME && + sf->encode_breakout_thresh > cpi->encode_breakout) + cpi->encode_breakout = sf->encode_breakout_thresh; +} diff --git a/media/libvpx/vp9/encoder/vp9_speed_features.h b/media/libvpx/vp9/encoder/vp9_speed_features.h new file mode 100644 index 000000000000..33c441f62e0d --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_speed_features.h @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#define VP9_ENCODER_VP9_SPEED_FEATURES_H_ + +#include "vp9/common/vp9_enums.h" + +#ifdef __cplusplus extern "C" { +#endif + +typedef enum { + DIAMOND = 0, + NSTEP = 1, + HEX = 2, + BIGDIA = 3, + SQUARE = 4, + FAST_HEX = 5, + FAST_DIAMOND = 6 +} SEARCH_METHODS; + +typedef enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. + ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for all frames based on bitrate constraints.
+ ALLOW_RECODE = 3, +} RECODE_LOOP_TYPE; + +typedef enum { + SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, + // Other methods to come +} SUBPEL_SEARCH_METHODS; + +typedef enum { + NO_MOTION_THRESHOLD = 0, + LOW_MOTION_THRESHOLD = 7 +} MOTION_THRESHOLD; + +typedef enum { + LAST_FRAME_PARTITION_OFF = 0, + LAST_FRAME_PARTITION_LOW_MOTION = 1, + LAST_FRAME_PARTITION_ALL = 2 +} LAST_FRAME_PARTITION_METHOD; + +typedef enum { + USE_FULL_RD = 0, + USE_LARGESTALL, + USE_TX_8X8 +} TX_SIZE_SEARCH_METHOD; + +typedef enum { + NOT_IN_USE = 0, + RELAXED_NEIGHBORING_MIN_MAX = 1, + CONSTRAIN_NEIGHBORING_MIN_MAX = 2, + STRICT_NEIGHBORING_MIN_MAX = 3 +} AUTO_MIN_MAX_MODE; + +typedef enum { + // Try the full image with different values. + LPF_PICK_FROM_FULL_IMAGE, + // Try a small portion of the image with different values. + LPF_PICK_FROM_SUBIMAGE, + // Estimate the level based on quantizer and frame type + LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF +} LPF_PICK_METHOD; + +typedef enum { + // Terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. + FLAG_EARLY_TERMINATE = 1 << 0, + + // Skips comp inter modes if the best so far is an intra mode. + FLAG_SKIP_COMP_BESTINTRA = 1 << 1, + + // Skips comp inter modes if the best single intermode so far does + // not have the same reference as one of the two references being + // tested. + FLAG_SKIP_COMP_REFMISMATCH = 1 << 2, + + // Skips oblique intra modes if the best so far is an inter mode. + FLAG_SKIP_INTRA_BESTINTER = 1 << 3, + + // Skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions. + FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, + + // Skips intra modes other than DC_PRED if the source variance is small + FLAG_SKIP_INTRA_LOWVAR = 1 << 5, +} MODE_SEARCH_SKIP_LOGIC; + +typedef enum { + FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP, + FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, + FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP, +} INTERP_FILTER_MASK; + +typedef enum { + // Search partitions using RD/NONRD criterion + SEARCH_PARTITION = 0, + + // Always use a fixed size partition + FIXED_PARTITION = 1, + + // Use a fixed size partition in every 64X64 SB, where the size is + // determined based on source variance + VAR_BASED_FIXED_PARTITION = 2, + + REFERENCE_PARTITION = 3, + + // Use an arbitrary partitioning scheme based on source variance within + // a 64X64 SB + VAR_BASED_PARTITION, + + // Use non-fixed partitions based on source variance + SOURCE_VAR_BASED_PARTITION +} PARTITION_SEARCH_TYPE; + +typedef enum { + // Does a dry run to see if any of the contexts need to be updated or not, + // before the final run. + TWO_LOOP = 0, + + // No dry run conducted. + ONE_LOOP = 1, + + // No dry run, also only half the coef contexts and bands are updated. + // The rest are not updated at all. + ONE_LOOP_REDUCED = 2 +} FAST_COEFF_UPDATE; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // This parameter controls which step in the n-step process we start at. + // It's changed adaptively based on circumstances. + int reduce_first_step_size; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. 
+ int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHODS subpel_search_method; + + // Maximum number of steps in logarithmic subpel search before giving up. + int subpel_iters_per_step; + + // Control when to stop subpel search + int subpel_force_stop; + + // This variable sets the step_param used in full pel motion search. + int fullpel_search_step_param; +} MV_SPEED_FEATURES; + +typedef struct SPEED_FEATURES { + MV_SPEED_FEATURES mv; + + // Frame level coding parameter update + int frame_parameter_update; + + RECODE_LOOP_TYPE recode_loop; + + // Trellis (dynamic programming) optimization of quantized values (+1, 0). + int optimize_coefficients; + + // Always set to 0. If on it enables 0 cost background transmission + // (except for the initial transmission of the segmentation). The feature is + // disabled because the addition of very large block sizes make the + // backgrounds very cheap to encode, and the segmentation we have + // adds overhead. + int static_segmentation; + + // If 1 we iterate finding a best reference for 2 ref frames together - via + // a log search that iterates 4 times (check around mv for last for best + // error of combined predictor then check around mv for alt). If 0 we + // just use the best motion vector found for each frame by itself. + BLOCK_SIZE comp_inter_joint_search_thresh; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. + int adaptive_rd_thresh; + + // Enables skipping the reconstruction step (idct, recon) in the + // intermediate steps assuming the last frame didn't have too many intra + // blocks and the q is less than a threshold. + int skip_encode_sb; + int skip_encode_frame; + // Speed feature to allow or disallow skipping of recode at block + // level within a frame. + int allow_skip_recode; + + // This variable allows us to reuse the last frame's partition choices + // (64x64 v 32x32 etc) for this frame. It can be set to only use the last + // frame as a starting point in low motion scenes or always use it. If set + // we use last partitioning_redo frequency to determine how often to redo + // the partitioning from scratch. Adjust_partitioning_from_last_frame + // enables us to adjust up or down one partitioning from the last frame's + // partitioning. + LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning; + + // The threshold is to determine how slow the motion is; it is used when + // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION + MOTION_THRESHOLD lf_motion_threshold; + + // Determine which method we use to determine transform size. We can choose + // between options like full rd, largest for prediction size, largest + // for intra and model coefs for the rest. + TX_SIZE_SEARCH_METHOD tx_size_search_method; + + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less + // precise but significantly faster than the non lp version. + int use_lp32x32fdct; + + // TODO(JBB): remove this as it's no longer used. + + // After looking at the first set of modes (set by index here), skip + // checking modes for reference frames that don't match the reference frame + // of the best so far. + int mode_skip_start; + + // TODO(JBB): Remove this.
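The subpel_search_method comment above describes a halving (logarithmic) refinement: step in 1/2-pel units while there is a gain, then 1/4-pel, then 1/8-pel. A minimal standalone 1-D sketch of that idea, using a toy quadratic error surface in place of the encoder's real distortion metric:

#include <stdio.h>

/* Toy error surface with a minimum near mv = 0.37; a stand-in for SAD/SSE. */
static double cost(double mv) { return (mv - 0.37) * (mv - 0.37); }

int main(void) {
  double best = 0.0;  /* start from the full-pel search result */
  double step;
  for (step = 0.5; step >= 0.125; step /= 2) {  /* 1/2, 1/4, 1/8 pel */
    for (;;) {  /* keep stepping while the error keeps dropping */
      if (cost(best - step) < cost(best))      best -= step;
      else if (cost(best + step) < cost(best)) best += step;
      else break;  /* no gain at this precision: halve the step */
    }
  }
  printf("refined mv = %.3f\n", best);  /* 0.375, the nearest 1/8-pel value */
  return 0;
}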
+ int reference_masking; + + PARTITION_SEARCH_TYPE partition_search_type; + + // Used if partition_search_type = FIXED_PARTITION + BLOCK_SIZE always_this_block_size; + + // Skip rectangular partition test when partition type none gives better + // rd than partition type split. + int less_rectangular_check; + + // Disable testing non square partitions. (e.g. 16x32) + int use_square_partition_only; + + // Sets min and max partition sizes for this 64x64 region based on the + // same 64x64 in last encoded frame, and the left and above neighbor. + AUTO_MIN_MAX_MODE auto_min_max_partition_size; + + // Min and max partition size we enable (block_size) as per auto + // min max, but also used by adjust partitioning, and pick_partitioning. + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; + + // Whether or not we allow partitions one smaller or one greater than the last + // frame's partitioning. Only used if use_lastframe_partitioning is set. + int adjust_partitioning_from_last_frame; + + // How frequently we redo the partitioning from scratch. Only used if + // use_lastframe_partitioning is set. + int last_partitioning_redo_frequency; + + // This enables constrained copy partitioning, which, given an input block + // size bsize, will copy previous partition for partitions less than bsize, + // otherwise bsize partition is used. bsize is currently set to 16x16. + // Used for the case where motion is detected in superblock. + int constrain_copy_partition; + + // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable + // it always, to allow it for only Last frame and Intra, disable it for all + // inter modes or to enable it always. + int disable_split_mask; + + // TODO(jingning): combine the related motion search speed features + // This allows us to use motion search at other sizes as a starting + // point for this motion search and limits the search range around it. + int adaptive_motion_search; + + // Allows sub 8x8 modes to use the prediction filter that was determined + // best for 8x8 mode. If set to 0 we always recheck all the filters for + // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter + // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. + int adaptive_pred_interp_filter; + + // Adaptive prediction mode search + int adaptive_mode_search; + + // Chessboard pattern prediction filter type search + int cb_pred_filter_search; + + int cb_partition_search; + + int motion_field_mode_search; + + int alt_ref_search_fp; + + // Fast quantization process path + int use_quant_fp; + + // Search through variable block partition types in non-RD mode decision + // encoding process for RTC. + int partition_check; + + // Use finer quantizer in every other few frames that run variable block + // partition type search. + int force_frame_boost; + + // Maximally allowed base quantization index fluctuation. + int max_delta_qindex; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_LOGIC enum + unsigned int mode_search_skip_flags; + + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + + // These bit masks allow you to enable or disable intra modes for each + // transform size separately.
+ int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // This variable enables an early break out of mode testing if the model for + // rd built from the prediction signal indicates a value that's much + // higher than the best rd we've seen so far. + int use_rd_breakout; + + // This enables us to use an estimate for intra rd based on dc mode rather + // than choosing an actual uv mode in the stage of encoding before the actual + // final encode. + int use_uv_intra_rd_estimate; + + // This feature controls how the loop filter level is determined. + LPF_PICK_METHOD lpf_pick; + + // This feature limits the number of coefficient updates we actually do + // by only looking at counts from 1/2 the bands. + FAST_COEFF_UPDATE use_fast_coef_updates; + + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV + // modes are used in order from LSB to MSB for each BLOCK_SIZE. + int inter_mode_mask[BLOCK_SIZES]; + + // This feature controls whether we do the expensive context update and + // calculation in the rd coefficient costing loop. + int use_fast_coef_costing; + + // This feature controls the tolerance vs target used in deciding whether to + // recode a frame. It has no meaning if recode is disabled. + int recode_tolerance; + + // This variable controls the maximum block size where intra blocks can be + // used in inter frames. + // TODO(aconverse): Fold this into one of the other many mode skips + BLOCK_SIZE max_intra_bsize; + + // The frequency that we check if SOURCE_VAR_BASED_PARTITION or + // FIXED_PARTITION search type should be used. + int search_type_check_frequency; + + // When partition is pre-set, the inter prediction result from pick_inter_mode + // can be reused in final block encoding process. It is enabled only for real- + // time mode speed 6. + int reuse_inter_pred_sby; + + // This variable sets the encode_breakout threshold. Currently, it is only + // enabled in real time mode. + int encode_breakout_thresh; + + // In real time encoding, increase the threshold for NEWMV. + int elevate_newmv_thresh; + + // default interp filter choice + INTERP_FILTER default_interp_filter; + + // Early termination in transform size search, which only applies while + // tx_size_search_method is USE_FULL_RD. + int tx_size_search_breakout; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; + + // mask for skip evaluation of certain interp_filter type. + INTERP_FILTER_MASK interp_filter_search_mask; + + // Partition search early breakout thresholds. + int64_t partition_search_breakout_dist_thr; + int partition_search_breakout_rate_thr; +} SPEED_FEATURES; + +struct VP9_COMP; + +void vp9_set_speed_features(struct VP9_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_ + diff --git a/media/libvpx/vp9/encoder/vp9_ssim.h b/media/libvpx/vp9/encoder/vp9_ssim.h new file mode 100644 index 000000000000..d1dd1b722371 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_ssim.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_SSIM_H_ +#define VP9_ENCODER_VP9_SSIM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vpx_scale/yv12config.h" + +double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, + double *weight); + +double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, + double *ssim_y, double *ssim_u, double *ssim_v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_SSIM_H_ diff --git a/media/libvpx/vp9/encoder/vp9_subexp.c b/media/libvpx/vp9/encoder/vp9_subexp.c index 387fc9056b59..530b5923bed2 100644 --- a/media/libvpx/vp9/encoder/vp9_subexp.c +++ b/media/libvpx/vp9/encoder/vp9_subexp.c @@ -11,33 +11,29 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" -#include "vp9/encoder/vp9_boolhuff.h" -#include "vp9/encoder/vp9_treewriter.h" +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_writer.h" -#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8) #define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) -static int update_bits[255]; - -static int count_uniform(int v, int n) { - int l = get_unsigned_bits(n); - int m; - if (l == 0) return 0; - m = (1 << l) - n; - if (v < m) - return l - 1; - else - return l; -} - -static int split_index(int i, int n, int modulus) { - int max1 = (n - 1 - modulus / 2) / modulus + 1; - if (i % modulus == modulus / 2) - i = i / modulus; - else - i = max1 + i - (i + modulus - modulus / 2) / modulus; - return i; -} +static const int update_bits[255] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, +}; static int recenter_nonneg(int v, int m) { if (v > (m << 1)) @@ -82,42 +78,14 @@ static int remap_prob(int v, int m) { return i; } -static int count_term_subexp(int word, int k, int num_syms) { - int count = 0; - int i = 0; - int mk = 0; - while (1) { - int b = (i ? 
k + i - 1 : k); - int a = (1 << b); - if (num_syms <= mk + 3 * a) { - count += count_uniform(word - mk, num_syms - mk); - break; - } else { - int t = (word >= mk + a); - count++; - if (t) { - i = i + 1; - mk += a; - } else { - count += b; - break; - } - } - } - return count; -} - static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { int delp = remap_prob(newp, oldp); return update_bits[delp] * 256; } -static void encode_uniform(vp9_writer *w, int v, int n) { - int l = get_unsigned_bits(n); - int m; - if (l == 0) - return; - m = (1 << l) - n; +static void encode_uniform(vp9_writer *w, int v) { + const int l = 8; + const int m = (1 << l) - 191; if (v < m) { vp9_write_literal(w, v, l - 1); } else { @@ -126,38 +94,26 @@ static void encode_uniform(vp9_writer *w, int v, int n) { } } -static void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) { - int i = 0; - int mk = 0; - while (1) { - int b = (i ? k + i - 1 : k); - int a = (1 << b); - if (num_syms <= mk + 3 * a) { - encode_uniform(w, word - mk, num_syms - mk); - break; - } else { - int t = (word >= mk + a); - vp9_write_literal(w, t, 1); - if (t) { - i = i + 1; - mk += a; - } else { - vp9_write_literal(w, word - mk, b); - break; - } - } +static INLINE int write_bit_gte(vp9_writer *w, int word, int test) { + vp9_write_literal(w, word >= test, 1); + return word >= test; +} + +static void encode_term_subexp(vp9_writer *w, int word) { + if (!write_bit_gte(w, word, 16)) { + vp9_write_literal(w, word, 4); + } else if (!write_bit_gte(w, word, 32)) { + vp9_write_literal(w, word - 16, 4); + } else if (!write_bit_gte(w, word, 64)) { + vp9_write_literal(w, word - 32, 5); + } else { + encode_uniform(w, word - 64); } } void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) { const int delp = remap_prob(newp, oldp); - encode_term_subexp(w, delp, SUBEXP_PARAM, 255); -} - -void vp9_compute_update_table() { - int i; - for (i = 0; i < 254; i++) - update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255); + encode_term_subexp(w, delp); } int vp9_prob_diff_update_savings_search(const unsigned int *ct, @@ -184,8 +140,7 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, const vp9_prob *oldp, vp9_prob *bestp, - vp9_prob upd, - int b, int r) { + vp9_prob upd) { int i, old_b, new_b, update_b, savings, bestsavings, step; int newp; vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES]; diff --git a/media/libvpx/vp9/encoder/vp9_subexp.h b/media/libvpx/vp9/encoder/vp9_subexp.h index 521c7778d3a3..8e02a1d0d5ef 100644 --- a/media/libvpx/vp9/encoder/vp9_subexp.h +++ b/media/libvpx/vp9/encoder/vp9_subexp.h @@ -9,11 +9,12 @@ */ -#ifndef VP9_DECODER_VP9_SUBEXP_H_ -#define VP9_DECODER_VP9_SUBEXP_H_ - -void vp9_compute_update_table(); +#ifndef VP9_ENCODER_VP9_SUBEXP_H_ +#define VP9_ENCODER_VP9_SUBEXP_H_ +#ifdef __cplusplus +extern "C" { +#endif void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp); @@ -29,7 +30,10 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct, int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, const vp9_prob *oldp, vp9_prob *bestp, - vp9_prob upd, - int b, int r); + vp9_prob upd); -#endif // VP9_DECODER_VP9_SUBEXP_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_SUBEXP_H_ diff --git a/media/libvpx/vp9/encoder/vp9_svc_layercontext.c b/media/libvpx/vp9/encoder/vp9_svc_layercontext.c new file mode 100644 index 
000000000000..7545d87b30ca --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_svc_layercontext.h" +#include "vp9/encoder/vp9_extend.h" + +void vp9_init_layer_context(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + int layer; + int layer_end; + int alt_ref_idx = svc->number_spatial_layers * svc->number_temporal_layers; + + svc->spatial_layer_id = 0; + svc->temporal_layer_id = 0; + + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + layer_end = svc->number_temporal_layers; + } else { + layer_end = svc->number_spatial_layers; + } + + for (layer = 0; layer < layer_end; ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + int i; + lc->current_video_frame_in_layer = 0; + lc->layer_size = 0; + lc->frames_from_key_frame = 0; + lc->last_frame_type = FRAME_TYPES; + lrc->ni_av_qi = oxcf->worst_allowed_q; + lrc->total_actual_bits = 0; + lrc->total_target_vs_actual = 0; + lrc->ni_tot_qi = 0; + lrc->tot_q = 0.0; + lrc->avg_q = 0.0; + lrc->ni_frames = 0; + lrc->decimation_count = 0; + lrc->decimation_factor = 0; + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + lrc->rate_correction_factors[i] = 1.0; + } + + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; + lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + } else { + lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; + lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q; + lrc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q + + oxcf->best_allowed_q) / 2; + if (oxcf->ss_play_alternate[layer]) + lc->alt_ref_idx = alt_ref_idx++; + else + lc->alt_ref_idx = -1; + lc->gold_ref_idx = -1; + } + + lrc->buffer_level = oxcf->starting_buffer_level_ms * + lc->target_bandwidth / 1000; + lrc->bits_off_target = lrc->buffer_level; + } + + // Still have extra buffer for base layer golden frame + if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) + && alt_ref_idx < REF_FRAMES) + svc->layer_context[0].gold_ref_idx = alt_ref_idx; +} + +// Update the layer context from a change_config() call.
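The buffer initialization at the end of vp9_init_layer_context() above converts the configured startup buffering, given in milliseconds, into bits at each layer's own target rate. A standalone worked example with made-up numbers, 600 ms of startup buffering for a 200 kbps layer:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const int64_t starting_buffer_level_ms = 600;     /* oxcf value, ms */
  const int64_t layer_target_bandwidth = 200000;    /* bits per second */
  /* Same arithmetic as the lrc->buffer_level line above. */
  const int64_t buffer_level =
      starting_buffer_level_ms * layer_target_bandwidth / 1000;
  printf("initial buffer_level = %lld bits\n", (long long)buffer_level);
  return 0;  /* prints 120000 */
}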
+void vp9_update_layer_context_change_config(VP9_COMP *const cpi, + const int target_bandwidth) { + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const RATE_CONTROL *const rc = &cpi->rc; + int layer; + int layer_end; + float bitrate_alloc = 1.0; + + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + layer_end = svc->number_temporal_layers; + } else { + layer_end = svc->number_spatial_layers; + } + + for (layer = 0; layer < layer_end; ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; + } else { + lc->target_bandwidth = oxcf->ss_target_bitrate[layer]; + } + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + // Update buffer-related quantities. + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = MIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); + // Update framerate-related quantities. + if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) { + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; + } else { + lc->framerate = cpi->framerate; + } + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + // Update qp-related quantities. + lrc->worst_quality = rc->worst_quality; + lrc->best_quality = rc->best_quality; + } +} + +static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) { + return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ? + &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : + &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; +} + +void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + const int layer = svc->temporal_layer_id; + + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). 
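The comment above introduces the per-layer frame-size calculation that follows: an enhancement temporal layer is charged only for the bits and frames it adds beyond the previous layer. A standalone worked example with made-up rates (layer 0: 200 kbps at 15 fps; layer 1: 400 kbps at 30 fps, where targets and framerates are cumulative):

#include <stdio.h>

int main(void) {
  const int target_bandwidth = 400000;             /* layer 1, cumulative */
  const int prev_layer_target_bandwidth = 200000;  /* layer 0 */
  const double framerate = 30.0;                   /* layer 1, cumulative */
  const double prev_layer_framerate = 15.0;        /* layer 0 */
  /* Same formula as the avg_frame_size assignment below. */
  const int avg_frame_size =
      (int)((target_bandwidth - prev_layer_target_bandwidth) /
            (framerate - prev_layer_framerate));
  printf("enhancement-layer avg frame size = %d bits\n", avg_frame_size);
  return 0;  /* prints 13333: 200000 extra bits over 15 extra frames */
}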
+ if (layer == 0) { + lc->avg_frame_size = lrc->avg_frame_bandwidth; + } else { + const double prev_layer_framerate = + cpi->framerate / oxcf->ts_rate_decimator[layer - 1]; + const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1]; + lc->avg_frame_size = + (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + + lc->framerate = framerate; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth * + oxcf->two_pass_vbrmin_section / 100); + lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * + oxcf->two_pass_vbrmax_section) / 100); + vp9_rc_set_gf_max_interval(cpi, lrc); +} + +void vp9_restore_layer_context(VP9_COMP *const cpi) { + LAYER_CONTEXT *const lc = get_layer_context(cpi); + const int old_frame_since_key = cpi->rc.frames_since_key; + const int old_frame_to_key = cpi->rc.frames_to_key; + + cpi->rc = lc->rc; + cpi->twopass = lc->twopass; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + cpi->alt_ref_source = lc->alt_ref_source; + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). + if (cpi->svc.number_temporal_layers > 1) { + cpi->rc.frames_since_key = old_frame_since_key; + cpi->rc.frames_to_key = old_frame_to_key; + } +} + +void vp9_save_layer_context(VP9_COMP *const cpi) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + + lc->rc = cpi->rc; + lc->twopass = cpi->twopass; + lc->target_bandwidth = (int)oxcf->target_bandwidth; + lc->alt_ref_source = cpi->alt_ref_source; +} + +void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { + SVC *const svc = &cpi->svc; + int i; + + for (i = 0; i < svc->number_spatial_layers; ++i) { + TWO_PASS *const twopass = &svc->layer_context[i].twopass; + + svc->spatial_layer_id = i; + vp9_init_second_pass(cpi); + + twopass->total_stats.spatial_layer_id = i; + twopass->total_left_stats.spatial_layer_id = i; + } + svc->spatial_layer_id = 0; +} + +void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { + LAYER_CONTEXT *const lc = + (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ? 
+ &cpi->svc.layer_context[cpi->svc.temporal_layer_id] : + &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + ++lc->current_video_frame_in_layer; + ++lc->frames_from_key_frame; +} + +int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { + return is_two_pass_svc(cpi) && + cpi->svc.spatial_layer_id > 0 && + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame; +} + +#if CONFIG_SPATIAL_SVC +int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx, + YV12_BUFFER_CONFIG *src, int64_t ts_start, + int64_t ts_end, unsigned int flags) { + struct lookahead_entry *buf; + int i, index; + + if (vp9_lookahead_push(ctx, src, ts_start, ts_end, flags)) + return 1; + + index = ctx->write_idx - 1; + if (index < 0) + index += ctx->max_sz; + + buf = ctx->buf + index; + + if (buf == NULL) + return 1; + + // Store svc parameters for each layer + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) + buf->svc_params[i] = cpi->svc.layer_context[i].svc_params_received; + + return 0; +} + +static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { + int layer_id; + vpx_svc_parameters_t *layer_param; + LAYER_CONTEXT *lc; + int count = 1 << (cpi->svc.number_temporal_layers - 1); + + // Find the next layer to be encoded + for (layer_id = 0; layer_id < cpi->svc.number_spatial_layers; ++layer_id) { + if (buf->svc_params[layer_id].spatial_layer >=0) + break; + } + + if (layer_id == cpi->svc.number_spatial_layers) + return 1; + + layer_param = &buf->svc_params[layer_id]; + cpi->svc.spatial_layer_id = layer_param->spatial_layer; + cpi->svc.temporal_layer_id = layer_param->temporal_layer; + cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; + + cpi->svc.temporal_layer_id = 0; + while ((lc->current_video_frame_in_layer % count) != 0) { + ++cpi->svc.temporal_layer_id; + count >>= 1; + } + + cpi->lst_fb_idx = + cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id; + if (lc->frames_from_key_frame < cpi->svc.number_temporal_layers) + cpi->ref_frame_flags &= ~VP9_LAST_FLAG; + + if (cpi->svc.spatial_layer_id == 0) { + if (cpi->svc.temporal_layer_id == 0) + cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ? 
+ lc->gold_ref_idx : cpi->lst_fb_idx; + else + cpi->gld_fb_idx = cpi->lst_fb_idx - 1; + } else { + if (cpi->svc.temporal_layer_id == 0) + cpi->gld_fb_idx = cpi->svc.spatial_layer_id - + cpi->svc.number_temporal_layers; + else + cpi->gld_fb_idx = cpi->lst_fb_idx - 1; + } + + if (lc->current_video_frame_in_layer == 0) { + if (cpi->svc.spatial_layer_id >= 2) { + cpi->alt_fb_idx = + cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers; + } else { + cpi->alt_fb_idx = cpi->lst_fb_idx; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); + } + } else { + if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) { + cpi->alt_fb_idx = lc->alt_ref_idx; + if (!lc->has_alt_frame) + cpi->ref_frame_flags &= (~VP9_ALT_FLAG); + } else { + // Find a proper alt_fb_idx for layers that don't have alt ref frame + if (cpi->svc.spatial_layer_id == 0) { + cpi->alt_fb_idx = cpi->lst_fb_idx; + } else { + LAYER_CONTEXT *lc_lower = + &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1]; + + if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id - 1] && + lc_lower->alt_ref_source != NULL) + cpi->alt_fb_idx = lc_lower->alt_ref_idx; + else if (cpi->svc.spatial_layer_id >= 2) + cpi->alt_fb_idx = + cpi->svc.spatial_layer_id - 2 * cpi->svc.number_temporal_layers; + else + cpi->alt_fb_idx = cpi->lst_fb_idx; + } + } + } + + if (vp9_set_size_literal(cpi, layer_param->width, layer_param->height) != 0) + return VPX_CODEC_INVALID_PARAM; + + cpi->oxcf.worst_allowed_q = + vp9_quantizer_to_qindex(layer_param->max_quantizer); + cpi->oxcf.best_allowed_q = + vp9_quantizer_to_qindex(layer_param->min_quantizer); + + vp9_change_config(cpi, &cpi->oxcf); + + vp9_set_high_precision_mv(cpi, 1); + + cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; + + return 0; +} + +struct lookahead_entry *vp9_svc_lookahead_peek(VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int index, int copy_params) { + struct lookahead_entry *buf = vp9_lookahead_peek(ctx, index); + + if (buf != NULL && copy_params != 0) { + if (copy_svc_params(cpi, buf) != 0) + return NULL; + } + return buf; +} + +struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int drain) { + struct lookahead_entry *buf = NULL; + + if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) { + buf = vp9_svc_lookahead_peek(cpi, ctx, 0, 1); + if (buf != NULL) { + // Only remove the buffer when popping the highest layer. Simply set the + // spatial_layer to -1 for lower layers. + buf->svc_params[cpi->svc.spatial_layer_id].spatial_layer = -1; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) { + vp9_lookahead_pop(ctx, drain); + } + } + } + + return buf; +} +#endif diff --git a/media/libvpx/vp9/encoder/vp9_svc_layercontext.h b/media/libvpx/vp9/encoder/vp9_svc_layercontext.h new file mode 100644 index 000000000000..1fc43a4271b2 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ + +#include "vpx/vpx_encoder.h" + +#include "vp9/encoder/vp9_ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + RATE_CONTROL rc; + int target_bandwidth; + double framerate; + int avg_frame_size; + TWO_PASS twopass; + vpx_fixed_buf_t rc_twopass_stats_in; + unsigned int current_video_frame_in_layer; + int is_key_frame; + int frames_from_key_frame; + FRAME_TYPE last_frame_type; + vpx_svc_parameters_t svc_params_received; + struct lookahead_entry *alt_ref_source; + int alt_ref_idx; + int gold_ref_idx; + int has_alt_frame; + size_t layer_size; +} LAYER_CONTEXT; + +typedef struct { + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + + // Store scaled source frames to be used for temporal filter to generate + // an alt ref frame. + YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS]; + + // Layer context used for rate control in one pass temporal CBR mode or + // two pass spatial mode. Defined for temporal or spatial layers for now. + // Does not support temporal combined with spatial RC. + LAYER_CONTEXT layer_context[MAX(VPX_TS_MAX_LAYERS, VPX_SS_MAX_LAYERS)]; +} SVC; + +struct VP9_COMP; + +// Initialize layer context data from init_config(). +void vp9_init_layer_context(struct VP9_COMP *const cpi); + +// Update the layer context from a change_config() call. +void vp9_update_layer_context_change_config(struct VP9_COMP *const cpi, + const int target_bandwidth); + +// Prior to encoding the frame, update framerate-related quantities +// for the current temporal layer. +void vp9_update_temporal_layer_framerate(struct VP9_COMP *const cpi); + +// Update framerate-related quantities for the current spatial layer. +void vp9_update_spatial_layer_framerate(struct VP9_COMP *const cpi, + double framerate); + +// Prior to encoding the frame, set the layer context, for the current layer +// to be encoded, to the cpi struct. +void vp9_restore_layer_context(struct VP9_COMP *const cpi); + +// Save the layer context after encoding the frame. +void vp9_save_layer_context(struct VP9_COMP *const cpi); + +// Initialize second pass rc for spatial svc.
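A standalone sketch of the indexing rule for the layer_context[] array declared above; it mirrors the ternary in get_layer_context() in the .c file earlier in this patch, where temporal-layer contexts are selected only for one-pass CBR temporal scalability and spatial-layer contexts are used otherwise:

#include <stdio.h>

int main(void) {
  /* Made-up configuration: 3 temporal layers, one-pass CBR. */
  const int number_temporal_layers = 3, rc_mode_is_cbr = 1;
  const int temporal_layer_id = 2, spatial_layer_id = 0;
  const int idx = (number_temporal_layers > 1 && rc_mode_is_cbr)
                      ? temporal_layer_id
                      : spatial_layer_id;
  printf("using layer_context[%d]\n", idx);  /* prints layer_context[2] */
  return 0;
}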
+void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); + +// Increment number of video frames in layer +void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); + +// Check if current layer is key frame in spatial upper layer +int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi); + +// Copy the source image, flags and svc parameters into a new framebuffer +// with the expected stride/border +int vp9_svc_lookahead_push(const struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, + int64_t ts_start, int64_t ts_end, + unsigned int flags); + +// Get the next source buffer to encode +struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int drain); + +// Get a future source buffer to encode +struct lookahead_entry *vp9_svc_lookahead_peek(struct VP9_COMP *const cpi, + struct lookahead_ctx *ctx, + int index, int copy_params); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ diff --git a/media/libvpx/vp9/encoder/vp9_temporal_filter.c b/media/libvpx/vp9/encoder/vp9_temporal_filter.c index 2cace0378daa..18a6a91d8a75 100644 --- a/media/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/media/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -11,37 +11,50 @@ #include <math.h> #include <limits.h> -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/common/vp9_systemdependent.h" -#include "vp9/encoder/vp9_quantize.h" #include "vp9/common/vp9_alloccommon.h" -#include "vp9/encoder/vp9_mcomp.h" -#include "vp9/encoder/vp9_firstpass.h" -#include "vp9/encoder/vp9_psnr.h" -#include "vpx_scale/vpx_scale.h" -#include "vp9/common/vp9_extend.h" -#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_mcomp.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_scale/vpx_scale.h" -#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering -#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering +static int fixed_divide[512]; static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, + int uv_block_width, + int uv_block_height, int mv_row, int mv_col, uint8_t *pred, - struct scale_factors *scale) { + struct scale_factors *scale, + int x, int y) { const int which_mv = 0; - MV mv = { mv_row, mv_col }; + const MV mv = { mv_row, mv_col }; + const InterpKernel *const kernel = + vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter); + + enum mv_precision mv_precision_uv; + int uv_stride; + if (uv_block_width == 8) { + uv_stride = (stride + 1) >> 1; + mv_precision_uv = MV_PRECISION_Q4; + } else { + uv_stride = stride; + mv_precision_uv = MV_PRECISION_Q3; + } vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, @@ -49,31 +62,38 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, scale, 16, 16, which_mv, - &xd->subpix, MV_PRECISION_Q3); + kernel, MV_PRECISION_Q3, x, y); - stride = (stride + 1) >> 1; - - vp9_build_inter_predictor(u_mb_ptr, stride, - &pred[256], 8, +
vp9_build_inter_predictor(u_mb_ptr, uv_stride, + &pred[256], uv_block_width, &mv, scale, - 8, 8, + uv_block_width, uv_block_height, which_mv, - &xd->subpix, MV_PRECISION_Q4); + kernel, mv_precision_uv, x, y); - vp9_build_inter_predictor(v_mb_ptr, stride, - &pred[320], 8, + vp9_build_inter_predictor(v_mb_ptr, uv_stride, + &pred[512], uv_block_width, &mv, scale, - 8, 8, + uv_block_width, uv_block_height, which_mv, - &xd->subpix, MV_PRECISION_Q4); + kernel, mv_precision_uv, x, y); +} + +void vp9_temporal_filter_init() { + int i; + + fixed_divide[0] = 0; + for (i = 1; i < 512; ++i) + fixed_divide[i] = 0x80000 / i; } void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, - unsigned int block_size, + unsigned int block_width, + unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, @@ -81,9 +101,10 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int i, j, k; int modifier; int byte = 0; + const int rounding = strength > 0 ? 1 << (strength - 1) : 0; - for (i = 0, k = 0; i < block_size; i++) { - for (j = 0; j < block_size; j++, k++) { + for (i = 0, k = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++, k++) { int src_byte = frame1[byte]; int pixel_value = *frame2++; @@ -93,7 +114,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff); modifier *= modifier; modifier *= 3; - modifier += 1 << (strength - 1); + modifier += rounding; modifier >>= strength; if (modifier > 16) @@ -108,34 +129,34 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, byte++; } - byte += stride - block_size; + byte += stride - block_width; } } -#if ALT_REF_MC_ENABLED - static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, uint8_t *arf_frame_buf, uint8_t *frame_ptr_buf, - int stride, - int error_thresh) { - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD* const xd = &x->e_mbd; + int stride) { + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; int step_param; int sadpb = x->sadperbit16; int bestsme = INT_MAX; + int distortion; + unsigned int sse; + int sad_list[5]; - int_mv best_ref_mv1; - int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - int_mv *ref_mv; + MV best_ref_mv1 = {0, 0}; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; - best_ref_mv1.as_int = 0; - best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >> 3; - best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >> 3; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; // Setup frame pointers x->plane[0].src.buf = arf_frame_buf; @@ -143,38 +164,24 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, xd->plane[0].pre[0].buf = frame_ptr_buf; xd->plane[0].pre[0].stride = stride; - // Further step/diamond searches as necessary - if (cpi->speed < 8) - step_param = cpi->sf.reduce_first_step_size + ((cpi->speed > 5) ? 
1 : 0); - else - step_param = cpi->sf.reduce_first_step_size + 2; - step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2)); + step_param = mv_sf->reduce_first_step_size; + step_param = MIN(step_param, MAX_MVSEARCH_STEPS - 2); - /*cpi->sf.search_method == HEX*/ // Ignore mv costing by sending NULL pointer instead of cost arrays - ref_mv = &x->e_mbd.mi_8x8[0]->bmi[0].as_mv[0]; - bestsme = vp9_hex_search(x, &best_ref_mv1_full.as_mv, - step_param, sadpb, 1, - &cpi->fn_ptr[BLOCK_16X16], - 0, &best_ref_mv1.as_mv, &ref_mv->as_mv); + vp9_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1, + cond_sad_list(cpi, sad_list), + &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv); -#if ALT_REF_SUBPEL_ENABLED - // Try sub-pixel MC? - // if (bestsme > error_thresh && bestsme < INT_MAX) - { - int distortion; - unsigned int sse; - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step(x, &ref_mv->as_mv, - &best_ref_mv1.as_mv, - cpi->common.allow_high_precision_mv, - x->errorperbit, - &cpi->fn_ptr[BLOCK_16X16], - 0, cpi->sf.subpel_iters_per_step, - NULL, NULL, - &distortion, &sse); - } -#endif + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step(x, ref_mv, + &best_ref_mv1, + cpi->common.allow_high_precision_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], + 0, mv_sf->subpel_iters_per_step, + cond_sad_list(cpi, sad_list), + NULL, NULL, + &distortion, &sse, NULL, 0, 0); // Restore input state x->plane[0].src = src; @@ -182,9 +189,9 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, return bestsme; } -#endif static void temporal_filter_iterate_c(VP9_COMP *cpi, + YV12_BUFFER_CONFIG **frames, int frame_count, int alt_ref_index, int strength, @@ -197,12 +204,14 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, int mb_rows = cpi->common.mb_rows; int mb_y_offset = 0; int mb_uv_offset = 0; - DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8); - DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 + 8 * 8 + 8 * 8); + DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3); + DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3); MACROBLOCKD *mbd = &cpi->mb.e_mbd; - YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index]; + YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; - DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 + 8 * 8 + 8 * 8); + DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3); + const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; + const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; // Save input state uint8_t* input_buffer[MAX_MB_PLANE]; @@ -212,9 +221,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, input_buffer[i] = mbd->plane[i].pre[0].buf; for (mb_row = 0; mb_row < mb_rows; mb_row++) { -#if ALT_REF_MC_ENABLED - // Source frames are extended to 16 pixels. This is different than - // L/A/G reference frames that have a border of 32 (VP9BORDERINPIXELS) + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) // A 6/8 tap filter is used for motion search. This requires 2 pixels // before and 3 pixels after. So the largest Y mv on a border would // then be 16 - VP9_INTERP_EXTEND. 
The UV blocks are half the size of the @@ -227,75 +235,72 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); -#endif for (mb_col = 0; mb_col < mb_cols; mb_col++) { int i, j, k; int stride; - vpx_memset(accumulator, 0, 384 * sizeof(unsigned int)); - vpx_memset(count, 0, 384 * sizeof(uint16_t)); + vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); + vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); -#if ALT_REF_MC_ENABLED cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); -#endif for (frame = 0; frame < frame_count; frame++) { - if (cpi->frames[frame] == NULL) + const int thresh_low = 10000; + const int thresh_high = 20000; + + if (frames[frame] == NULL) continue; - mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col = 0; + mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; + mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; if (frame == alt_ref_index) { filter_weight = 2; } else { - int err = 0; -#if ALT_REF_MC_ENABLED -#define THRESH_LOW 10000 -#define THRESH_HIGH 20000 - // Find best match in this frame by MC - err = temporal_filter_find_matching_mb_c - (cpi, - cpi->frames[alt_ref_index]->y_buffer + mb_y_offset, - cpi->frames[frame]->y_buffer + mb_y_offset, - cpi->frames[frame]->y_stride, - THRESH_LOW); -#endif + int err = temporal_filter_find_matching_mb_c(cpi, + frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, + frames[frame]->y_stride); + // Assign higher weight to matching MB if its error // score is lower. If not applying MC default behavior // is to weight all MBs equally. - filter_weight = err < THRESH_LOW - ? 2 : err < THRESH_HIGH ? 1 : 0; + filter_weight = err < thresh_low + ? 2 : err < thresh_high ?
1 : 0; } if (filter_weight != 0) { // Construct the predictors - temporal_filter_predictors_mb_c - (mbd, - cpi->frames[frame]->y_buffer + mb_y_offset, - cpi->frames[frame]->u_buffer + mb_uv_offset, - cpi->frames[frame]->v_buffer + mb_uv_offset, - cpi->frames[frame]->y_stride, - mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col, - predictor, scale); + temporal_filter_predictors_mb_c(mbd, + frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, + frames[frame]->y_stride, + mb_uv_width, mb_uv_height, + mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, + mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, + predictor, scale, + mb_col * 16, mb_row * 16); // Apply the filter (YUV) vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, strength, filter_weight, + predictor, 16, 16, + strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, 8, strength, filter_weight, - accumulator + 256, count + 256); - + predictor + 256, + mb_uv_width, mb_uv_height, strength, + filter_weight, accumulator + 256, + count + 256); vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 320, 8, strength, filter_weight, - accumulator + 320, count + 320); + predictor + 512, + mb_uv_width, mb_uv_height, strength, + filter_weight, accumulator + 512, + count + 512); } } @@ -306,7 +311,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, for (i = 0, k = 0; i < 16; i++) { for (j = 0; j < 16; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= cpi->fixed_divide[count[k]]; + pval *= fixed_divide[count[k]]; pval >>= 19; dst1[byte] = (uint8_t)pval; @@ -314,7 +319,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, // move to next pixel byte++; } - byte += stride - 16; } @@ -322,35 +326,32 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < 8; i++) { - for (j = 0; j < 8; j++, k++) { - int m = k + 64; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; // U unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= cpi->fixed_divide[count[k]]; + pval *= fixed_divide[count[k]]; pval >>= 19; dst1[byte] = (uint8_t)pval; // V pval = accumulator[m] + (count[m] >> 1); - pval *= cpi->fixed_divide[count[m]]; + pval *= fixed_divide[count[m]]; pval >>= 19; dst2[byte] = (uint8_t)pval; // move to next pixel byte++; } - - byte += stride - 8; + byte += stride - mb_uv_width; } - mb_y_offset += 16; - mb_uv_offset += 8; + mb_uv_offset += mb_uv_width; } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += 8 * (f->uv_stride - mb_cols); + mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; } // Restore input state @@ -358,170 +359,133 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, mbd->plane[i].pre[0].buf = input_buffer[i]; } -void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { - VP9_COMMON *const cm = &cpi->common; +// Apply buffer limits and context specific adjustments to arnr filter. 
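The write-back loops above divide each accumulated, weighted pixel sum by its weight count without an integer divide: they multiply by the precomputed fixed_divide[] table (0x80000 / n, filled in by vp9_temporal_filter_init() earlier) and shift right by 19, after adding count >> 1 for rounding. A standalone check of that fixed-point rounding, with made-up values:

#include <stdio.h>

int main(void) {
  unsigned int fixed_divide[512];
  unsigned int n, acc, pval;
  for (n = 1; n < 512; ++n)
    fixed_divide[n] = 0x80000 / n;  /* same init as the patch */
  acc = 1000;  /* accumulated weighted pixel sum */
  n = 7;       /* accumulated weight count */
  pval = acc + (n >> 1);      /* add half the divisor to round */
  pval *= fixed_divide[n];    /* multiply by 2^19 / n */
  pval >>= 19;                /* undo the 2^19 scale */
  printf("%u / %u ~= %u\n", acc, n, pval);  /* 1000 / 7 -> 143, rounded */
  return 0;
}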
+static void adjust_arnr_filter(VP9_COMP *cpi, + int distance, int group_boost, + int *arnr_frames, int *arnr_strength) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const int frames_after_arf = + vp9_lookahead_depth(cpi->lookahead) - distance - 1; + int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; + int frames_bwd; + int q, frames, strength; - int frame = 0; + // Define the forward and backwards filter limits for this arnr group. + if (frames_fwd > frames_after_arf) + frames_fwd = frames_after_arf; + if (frames_fwd > distance) + frames_fwd = distance; - int frames_to_blur_backward = 0; - int frames_to_blur_forward = 0; - int frames_to_blur = 0; - int start_frame = 0; + frames_bwd = frames_fwd; - int strength = cpi->active_arnr_strength; - int blur_type = cpi->oxcf.arnr_type; - int max_frames = cpi->active_arnr_frames; + // For even length filter there is one more frame backward + // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. + if (frames_bwd < distance) + frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1; - const int num_frames_backward = distance; - const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead) - - (num_frames_backward + 1); + // Set the baseline active filter size. + frames = frames_bwd + 1 + frames_fwd; - struct scale_factors scale; - struct scale_factors_common scale_comm; - - switch (blur_type) { - case 1: - // Backward Blur - frames_to_blur_backward = num_frames_backward; - - if (frames_to_blur_backward >= max_frames) - frames_to_blur_backward = max_frames - 1; - - frames_to_blur = frames_to_blur_backward + 1; - break; - - case 2: - // Forward Blur - - frames_to_blur_forward = num_frames_forward; - - if (frames_to_blur_forward >= max_frames) - frames_to_blur_forward = max_frames - 1; - - frames_to_blur = frames_to_blur_forward + 1; - break; - - case 3: - default: - // Center Blur - frames_to_blur_forward = num_frames_forward; - frames_to_blur_backward = num_frames_backward; - - if (frames_to_blur_forward > frames_to_blur_backward) - frames_to_blur_forward = frames_to_blur_backward; - - if (frames_to_blur_backward > frames_to_blur_forward) - frames_to_blur_backward = frames_to_blur_forward; - - // When max_frames is even we have 1 more frame backward than forward - if (frames_to_blur_forward > (max_frames - 1) / 2) - frames_to_blur_forward = ((max_frames - 1) / 2); - - if (frames_to_blur_backward > (max_frames / 2)) - frames_to_blur_backward = (max_frames / 2); - - frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; - break; - } - - start_frame = distance + frames_to_blur_forward; - -#ifdef DEBUGFWG - // DEBUG FWG - printf( - "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d " - "start:%d", - max_frames, num_frames_backward, num_frames_forward, frames_to_blur, - frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index, - cpi->last_alt_ref_sei, start_frame); -#endif - - // Setup scaling factors. 
Scaling on each of the arnr frames is not supported - vp9_setup_scale_factors_for_frame(&scale, &scale_comm, - get_frame_new_buffer(cm)->y_crop_width, - get_frame_new_buffer(cm)->y_crop_height, - cm->width, cm->height); - - // Setup frame pointers, NULL indicates frame not included in filter - vp9_zero(cpi->frames); - for (frame = 0; frame < frames_to_blur; frame++) { - int which_buffer = start_frame - frame; - struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, - which_buffer); - cpi->frames[frames_to_blur - 1 - frame] = &buf->img; - } - - temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward, - strength, &scale); -} - -void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame, - const int group_boost) { - int half_gf_int; - int frames_after_arf; - int frames_bwd = cpi->oxcf.arnr_max_frames - 1; - int frames_fwd = cpi->oxcf.arnr_max_frames - 1; - int q; - - // Define the arnr filter width for this group of frames: - // We only filter frames that lie within a distance of half - // the GF interval from the ARF frame. We also have to trap - // cases where the filter extends beyond the end of clip. - // Note: this_frame->frame has been updated in the loop - // so it now points at the ARF frame. - half_gf_int = cpi->baseline_gf_interval >> 1; - frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1); - - switch (cpi->oxcf.arnr_type) { - case 1: // Backward filter - frames_fwd = 0; - if (frames_bwd > half_gf_int) - frames_bwd = half_gf_int; - break; - - case 2: // Forward filter - if (frames_fwd > half_gf_int) - frames_fwd = half_gf_int; - if (frames_fwd > frames_after_arf) - frames_fwd = frames_after_arf; - frames_bwd = 0; - break; - - case 3: // Centered filter - default: - frames_fwd >>= 1; - if (frames_fwd > frames_after_arf) - frames_fwd = frames_after_arf; - if (frames_fwd > half_gf_int) - frames_fwd = half_gf_int; - - frames_bwd = frames_fwd; - - // For even length filter there is one more frame backward - // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. - if (frames_bwd < half_gf_int) - frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1; - break; - } - - cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; - - // Adjust the strength based on active max q - q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1); - if (q > 8) { - cpi->active_arnr_strength = cpi->oxcf.arnr_strength; + // Adjust the strength based on active max q. + if (cpi->common.current_video_frame > 1) + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME])); + else + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[KEY_FRAME])); + if (q > 16) { + strength = oxcf->arnr_strength; } else { - cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q); - if (cpi->active_arnr_strength < 0) - cpi->active_arnr_strength = 0; + strength = oxcf->arnr_strength - ((16 - q) / 2); + if (strength < 0) + strength = 0; } // Adjust number of frames in filter and strength based on gf boost level. - if (cpi->active_arnr_frames > (group_boost / 150)) { - cpi->active_arnr_frames = (group_boost / 150); - cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1); + if (frames > group_boost / 150) { + frames = group_boost / 150; + frames += !(frames & 1); } - if (cpi->active_arnr_strength > (group_boost / 300)) { - cpi->active_arnr_strength = (group_boost / 300); + + if (strength > group_boost / 300) { + strength = group_boost / 300; } + + // Adjustments for second level arf in multi arf case. 
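The clamps in adjust_arnr_filter() above tie both the filter length and its strength to the group boost, and taper strength as the active q drops. A standalone worked example with made-up inputs (configured arnr_strength 5, active q 12, group boost 900, 8 candidate frames), using the same arithmetic as the function:

#include <stdio.h>

int main(void) {
  const int arnr_strength = 5, q = 12, group_boost = 900;
  int frames = 8;
  int strength = (q > 16) ? arnr_strength : arnr_strength - (16 - q) / 2;
  if (strength < 0) strength = 0;             /* here: 5 - 2 = 3 */
  if (frames > group_boost / 150) {           /* 900 / 150 = 6 */
    frames = group_boost / 150;
    frames += !(frames & 1);                  /* keep the frame count odd */
  }
  if (strength > group_boost / 300)           /* 900 / 300 = 3 */
    strength = group_boost / 300;
  printf("frames=%d strength=%d\n", frames, strength);  /* frames=7 strength=3 */
  return 0;
}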
+ if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { + strength >>= 1; + } + } + + *arnr_frames = frames; + *arnr_strength = strength; +} + +void vp9_temporal_filter(VP9_COMP *cpi, int distance) { + VP9_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + int frame; + int frames_to_blur; + int start_frame; + int strength; + int frames_to_blur_backward; + int frames_to_blur_forward; + struct scale_factors sf; + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = {NULL}; + + // Apply context specific adjustments to the arnr filter parameters. + adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength); + frames_to_blur_backward = (frames_to_blur / 2); + frames_to_blur_forward = ((frames_to_blur - 1) / 2); + start_frame = distance + frames_to_blur_forward; + + // Setup frame pointers, NULL indicates frame not included in filter. + for (frame = 0; frame < frames_to_blur; ++frame) { + const int which_buffer = start_frame - frame; + struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, + which_buffer); + frames[frames_to_blur - 1 - frame] = &buf->img; + } + + // Setup scaling factors. Scaling on each of the arnr frames is not supported + if (is_two_pass_svc(cpi)) { + // In spatial svc the scaling factors might be less than 1/2. So we will use + // non-normative scaling. + int frame_used = 0; + vp9_setup_scale_factors_for_frame(&sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height); + + for (frame = 0; frame < frames_to_blur; ++frame) { + if (cm->mi_cols * MI_SIZE != frames[frame]->y_width || + cm->mi_rows * MI_SIZE != frames[frame]->y_height) { + if (vp9_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used], + cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, + NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to reallocate alt_ref_buffer"); + + frames[frame] = vp9_scale_if_required(cm, frames[frame], + &cpi->svc.scaled_frames[frame_used]); + ++frame_used; + } + } + } else { + vp9_setup_scale_factors_for_frame(&sf, + get_frame_new_buffer(cm)->y_crop_width, + get_frame_new_buffer(cm)->y_crop_height, + cm->width, cm->height); + } + + temporal_filter_iterate_c(cpi, frames, frames_to_blur, + frames_to_blur_backward, strength, &sf); } diff --git a/media/libvpx/vp9/encoder/vp9_temporal_filter.h b/media/libvpx/vp9/encoder/vp9_temporal_filter.h index c5f3b467e540..a971e0ae365e 100644 --- a/media/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/media/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -11,8 +11,15 @@ #ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance); -void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame, - const int group_boost); +#ifdef __cplusplus +extern "C" { +#endif + +void vp9_temporal_filter_init(); +void vp9_temporal_filter(VP9_COMP *cpi, int distance); + +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/media/libvpx/vp9/encoder/vp9_tokenize.c b/media/libvpx/vp9/encoder/vp9_tokenize.c index c7336d003a5a..6068b85a0435 --- a/media/libvpx/vp9/encoder/vp9_tokenize.c +++
b/media/libvpx/vp9/encoder/vp9_tokenize.c @@ -8,25 +8,98 @@ * be found in the AUTHORS file in the root of the source tree. */ - +#include <assert.h> #include <math.h> #include <stdio.h> #include <string.h> -#include <assert.h> -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_tokenize.h" + #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" -#include "vp9/common/vp9_entropy.h" + +#include "vp9/encoder/vp9_cost.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tokenize.h" static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2]; const TOKENVALUE *vp9_dct_value_tokens_ptr; -static int dct_value_cost[DCT_MAX_VALUE * 2]; -const int *vp9_dct_value_cost_ptr; +static int16_t dct_value_cost[DCT_MAX_VALUE * 2]; +const int16_t *vp9_dct_value_cost_ptr; -static void fill_value_tokens() { +// Array indices are identical to previously-existing CONTEXT_NODE indices +const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = { + -EOB_TOKEN, 2, // 0 = EOB + -ZERO_TOKEN, 4, // 1 = ZERO + -ONE_TOKEN, 6, // 2 = ONE + 8, 12, // 3 = LOW_VAL + -TWO_TOKEN, 10, // 4 = TWO + -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE + 14, 16, // 6 = HIGH_LOW + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE + 18, 20, // 8 = CAT_THREEFOUR + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE +}; + +// Unconstrained Node Tree +const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = { + 2, 6, // 0 = LOW_VAL + -TWO_TOKEN, 4, // 1 = TWO + -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE + 8, 10, // 3 = HIGH_LOW + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE + 12, 14, // 5 = CAT_THREEFOUR + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE +}; + +static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; + +static void init_bit_tree(vp9_tree_index *p, int n) { + int i = 0; + + while (++i < n) { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; +} + +static void init_bit_trees() { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 14); +} + +const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = { + {0, 0, 0, 0}, // ZERO_TOKEN + {0, 0, 0, 1}, // ONE_TOKEN + {0, 0, 0, 2}, // TWO_TOKEN + {0, 0, 0, 3}, // THREE_TOKEN + {0, 0, 0, 4}, // FOUR_TOKEN + {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN + {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN + {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN + {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN + {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN + {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL}, // CATEGORY6_TOKEN + {0, 0, 0, 0} // EOB_TOKEN +}; + +struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS]; + +void vp9_coef_tree_initialize() { + init_bit_trees(); + vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); +} + +void vp9_tokenize_initialize() { TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE; const vp9_extra_bit *const e = vp9_extra_bits; @@ -80,18 +153,48 @@ struct tokenize_b_args { VP9_COMP *cpi; MACROBLOCKD *xd; TOKENEXTRA **tp; - TX_SIZE tx_size; - uint8_t *token_cache; }; static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct tokenize_b_args* const args = arg; MACROBLOCKD *const xd = args->xd; + struct macroblock_plane *p =
&args->cpi->mb.plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; int aoff, loff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); - set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff); + vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, + aoff, loff); +} + +static INLINE void add_token(TOKENEXTRA **t, const vp9_prob *context_tree, + int16_t extra, uint8_t token, + uint8_t skip_eob_node, + unsigned int *counts) { + (*t)->token = token; + (*t)->extra = extra; + (*t)->context_tree = context_tree; + (*t)->skip_eob_node = skip_eob_node; + (*t)++; + ++counts[token]; +} + +static INLINE void add_token_no_extra(TOKENEXTRA **t, + const vp9_prob *context_tree, + uint8_t token, + uint8_t skip_eob_node, + unsigned int *counts) { + (*t)->token = token; + (*t)->context_tree = context_tree; + (*t)->skip_eob_node = skip_eob_node; + (*t)++; + ++counts[token]; +} + +static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id, + TX_SIZE tx_size) { + const int eob_max = 16 << (tx_size << 1); + return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, @@ -100,92 +203,98 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize, VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; - uint8_t *token_cache = args->token_cache; + uint8_t token_cache[32 * 32]; + struct macroblock_plane *p = &cpi->mb.plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; - MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; int pt; /* near block/prev token context index */ - int c = 0, rc = 0; + int c; TOKENEXTRA *t = *tp; /* store tokens starting here */ - const int eob = pd->eobs[block]; + int eob = p->eobs[block]; const PLANE_TYPE type = pd->plane_type; - const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); - + const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; - vp9_coeff_count *const counts = cpi->coef_counts[tx_size]; - vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size]; + const scan_order *so; const int ref = is_inter_block(mbmi); - const uint8_t *const band_translate = get_band_translate(tx_size); + unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = + cpi->coef_counts[tx_size][type][ref]; + vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] = + cpi->common.fc.coef_probs[tx_size][type][ref]; + unsigned int (*const eob_branch)[COEFF_CONTEXTS] = + cpi->common.counts.eob_branch[tx_size][type][ref]; + const uint8_t *const band = get_band_translate(tx_size); const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size); + int aoff, loff; txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff); - assert((!type && !plane) || (type && plane)); - pt = get_entropy_context(tx_size, pd->above_context + aoff, - pd->left_context + loff); - get_scan(xd, tx_size, type, block, &scan, &nb); + pd->left_context + loff); + so = get_scan(xd, tx_size, type, block); + scan = so->scan; + nb = so->neighbors; c = 0; - do { - const int band = band_translate[c]; - int token; + while (c < eob) { int v = 0; - rc = scan[c]; - if (c) - pt = get_coef_context(nb, token_cache, c); - if (c < eob) { - v = qcoeff_ptr[rc]; - assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE); + int skip_eob = 0; + v = qcoeff[scan[c]]; - t->extra = 
vp9_dct_value_tokens_ptr[v].extra; - token = vp9_dct_value_tokens_ptr[v].token; - } else { - token = DCT_EOB_TOKEN; + while (!v) { + add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob, + counts[band[c]][pt]); + eob_branch[band[c]][pt] += !skip_eob; + + skip_eob = 1; + token_cache[scan[c]] = 0; + ++c; + pt = get_coef_context(nb, token_cache, c); + v = qcoeff[scan[c]]; } - t->token = token; - t->context_tree = coef_probs[type][ref][band][pt]; - t->skip_eob_node = (c > 0) && (token_cache[scan[c - 1]] == 0); + add_token(&t, coef_probs[band[c]][pt], + vp9_dct_value_tokens_ptr[v].extra, + (uint8_t)vp9_dct_value_tokens_ptr[v].token, + (uint8_t)skip_eob, + counts[band[c]][pt]); + eob_branch[band[c]][pt] += !skip_eob; - assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); - - ++counts[type][ref][band][pt][token]; - if (!t->skip_eob_node) - ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt]; - - token_cache[rc] = vp9_pt_energy_class[token]; - ++t; - } while (c < eob && ++c < seg_eob); + token_cache[scan[c]] = + vp9_pt_energy_class[vp9_dct_value_tokens_ptr[v].token]; + ++c; + pt = get_coef_context(nb, token_cache, c); + } + if (c < seg_eob) { + add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0, + counts[band[c]][pt]); + ++eob_branch[band[c]][pt]; + } *tp = t; - set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff); + vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, aoff, loff); } struct is_skippable_args { - MACROBLOCKD *xd; + MACROBLOCK *x; int *skippable; }; - static void is_skippable(int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) { struct is_skippable_args *args = argv; - args->skippable[0] &= (!args->xd->plane[plane].eobs[block]); + (void)plane_bsize; + (void)tx_size; + args->skippable[0] &= (!args->x->plane[plane].eobs[block]); } -int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) { +// TODO(yaowu): rewrite and optimize this function to remove the usage of +// vp9_foreach_transform_block() and simplify is_skippable(). 
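For orientation, the rewritten tokenize_b() above no longer visits every position up to seg_eob; it emits explicit runs of ZERO_TOKENs, then the non-zero value token, and a trailing EOB_TOKEN only when the block ends before seg_eob. A hypothetical trace (coefficient values invented for illustration; token names from the vp9_coef_tree table added earlier):

    /* Scan-ordered coefficients {3, 0, 0, -1}, eob = 4, would emit:
     *   THREE_TOKEN (skip_eob_node = 0)
     *   ZERO_TOKEN  (skip_eob_node = 0)
     *   ZERO_TOKEN  (skip_eob_node = 1, preceded by a zero)
     *   ONE_TOKEN   (skip_eob_node = 1, extra carries the sign)
     *   EOB_TOKEN   (emitted because c < seg_eob)
     * As zero and value tokens are emitted, token_cache[scan[c]] is set
     * to vp9_pt_energy_class[token] so that get_coef_context() can
     * derive the context index pt for the next position. */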
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { int result = 1; - struct is_skippable_args args = {xd, &result}; - foreach_transformed_block(xd, bsize, is_skippable, &args); - return result; -} - -int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane) { - int result = 1; - struct is_skippable_args args = {xd, &result}; - foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &args); + struct is_skippable_args args = {x, &result}; + vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable, + &args); return result; } @@ -193,17 +302,15 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; + MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; TOKENEXTRA *t_backup = *t; - const int mb_skip_context = vp9_get_pred_context_mbskip(xd); + const int ctx = vp9_get_skip_context(xd); const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache}; - - mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize); - if (mbmi->skip_coeff) { + struct tokenize_b_args arg = {cpi, xd, t}; + if (mbmi->skip) { if (!dry_run) - cm->counts.mbskip[mb_skip_context][1] += skip_inc; + cm->counts.skip[ctx][1] += skip_inc; reset_skip_context(xd, bsize); if (dry_run) *t = t_backup; @@ -211,14 +318,10 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, } if (!dry_run) { - cm->counts.mbskip[mb_skip_context][0] += skip_inc; - foreach_transformed_block(xd, bsize, tokenize_b, &arg); + cm->counts.skip[ctx][0] += skip_inc; + vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg); } else { - foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg); + vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg); *t = t_backup; } } - -void vp9_tokenize_initialize() { - fill_value_tokens(); -} diff --git a/media/libvpx/vp9/encoder/vp9_tokenize.h b/media/libvpx/vp9/encoder/vp9_tokenize.h index e24e31b8091e..063c0bafe7ba 100644 --- a/media/libvpx/vp9/encoder/vp9_tokenize.h +++ b/media/libvpx/vp9/encoder/vp9_tokenize.h @@ -12,10 +12,18 @@ #define VP9_ENCODER_VP9_TOKENIZE_H_ #include "vp9/common/vp9_entropy.h" + #include "vp9/encoder/vp9_block.h" +#include "vp9/encoder/vp9_treewriter.h" + +#ifdef __cplusplus +extern "C" { +#endif void vp9_tokenize_initialize(); +#define EOSB_TOKEN 127 // Not signalled, encoder only + typedef struct { int16_t token; int16_t extra; @@ -28,19 +36,26 @@ typedef struct { uint8_t skip_eob_node; } TOKENEXTRA; -int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize); -int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize, - int plane); +extern const vp9_tree_index vp9_coef_tree[]; +extern const vp9_tree_index vp9_coef_con_tree[]; +extern struct vp9_token vp9_coef_encodings[]; + +int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); + struct VP9_COMP; void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize); -extern const int *vp9_dct_value_cost_ptr; +extern const int16_t *vp9_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the * fields are not. 
*/ extern const TOKENVALUE *vp9_dct_value_tokens_ptr; +#ifdef __cplusplus +} // extern "C" +#endif + #endif // VP9_ENCODER_VP9_TOKENIZE_H_ diff --git a/media/libvpx/vp9/encoder/vp9_treewriter.c b/media/libvpx/vp9/encoder/vp9_treewriter.c index e4aed5374cf4..bb04b4025c8d 100644 --- a/media/libvpx/vp9/encoder/vp9_treewriter.c +++ b/media/libvpx/vp9/encoder/vp9_treewriter.c @@ -10,29 +10,49 @@ #include "vp9/encoder/vp9_treewriter.h" -static void cost(int *costs, vp9_tree tree, const vp9_prob *probs, - int i, int c) { - const vp9_prob prob = probs[i / 2]; - int b; +static void tree2tok(struct vp9_token *tokens, const vp9_tree_index *tree, + int i, int v, int l) { + v += v; + ++l; - for (b = 0; b <= 1; ++b) { - const int cc = c + vp9_cost_bit(prob, b); - const vp9_tree_index ii = tree[i + b]; - - if (ii <= 0) - costs[-ii] = cc; - else - cost(costs, tree, probs, ii, cc); - } + do { + const vp9_tree_index j = tree[i++]; + if (j <= 0) { + tokens[-j].value = v; + tokens[-j].len = l; + } else { + tree2tok(tokens, tree, j, v, l); + } + } while (++v & 1); } -void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree) { - cost(costs, tree, probs, 0, 0); +void vp9_tokens_from_tree(struct vp9_token *tokens, + const vp9_tree_index *tree) { + tree2tok(tokens, tree, 0, 0, 0); } -void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) { - assert(tree[0] <= 0 && tree[1] > 0); +static unsigned int convert_distribution(unsigned int i, vp9_tree tree, + unsigned int branch_ct[][2], + const unsigned int num_events[]) { + unsigned int left, right; - costs[-tree[0]] = vp9_cost_bit(probs[0], 0); - cost(costs, tree, probs, 2, 0); + if (tree[i] <= 0) + left = num_events[-tree[i]]; + else + left = convert_distribution(tree[i], tree, branch_ct, num_events); + + if (tree[i + 1] <= 0) + right = num_events[-tree[i + 1]]; + else + right = convert_distribution(tree[i + 1], tree, branch_ct, num_events); + + branch_ct[i >> 1][0] = left; + branch_ct[i >> 1][1] = right; + return left + right; +} + +void vp9_tree_probs_from_distribution(vp9_tree tree, + unsigned int branch_ct[/* n-1 */][2], + const unsigned int num_events[/* n */]) { + convert_distribution(0, tree, branch_ct, num_events); } diff --git a/media/libvpx/vp9/encoder/vp9_treewriter.h b/media/libvpx/vp9/encoder/vp9_treewriter.h index eeda5cda7961..4a76d87cdfea 100644 --- a/media/libvpx/vp9/encoder/vp9_treewriter.h +++ b/media/libvpx/vp9/encoder/vp9_treewriter.h @@ -8,47 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ - #ifndef VP9_ENCODER_VP9_TREEWRITER_H_ #define VP9_ENCODER_VP9_TREEWRITER_H_ -/* Trees map alphabets into huffman-like codes suitable for an arithmetic - bit coder. Timothy S Murphy 11 October 2004 */ +#include "vp9/encoder/vp9_writer.h" -#include "vp9/common/vp9_treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif -#include "vp9/encoder/vp9_boolhuff.h" /* for now */ +void vp9_tree_probs_from_distribution(vp9_tree tree, + unsigned int branch_ct[ /* n - 1 */ ][2], + const unsigned int num_events[ /* n */ ]); +struct vp9_token { + int value; + int len; +}; -#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8) - -/* Approximate length of an encoded bool in 256ths of a bit at given prob */ - -#define vp9_cost_zero(x) (vp9_prob_cost[x]) -#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x)) - -#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x)) - -/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */ - - -/* Both of these return bits, not scaled bits. 
*/ -static INLINE unsigned int cost_branch256(const unsigned int ct[2], - vp9_prob p) { - return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p); -} - -static INLINE unsigned int cost_branch(const unsigned int ct[2], - vp9_prob p) { - return cost_branch256(ct, p) >> 8; -} - - -static INLINE void treed_write(vp9_writer *w, - vp9_tree tree, const vp9_prob *probs, - int bits, int len) { - vp9_tree_index i = 0; +void vp9_tokens_from_tree(struct vp9_token*, const vp9_tree_index *); +static INLINE void vp9_write_tree(vp9_writer *w, const vp9_tree_index *tree, + const vp9_prob *probs, int bits, int len, + vp9_tree_index i) { do { const int bit = (bits >> --len) & 1; vp9_write(w, bit, probs[i >> 1]); @@ -56,32 +38,14 @@ static INLINE void treed_write(vp9_writer *w, } while (len); } -static INLINE void write_token(vp9_writer *w, vp9_tree tree, - const vp9_prob *probs, - const struct vp9_token *token) { - treed_write(w, tree, probs, token->value, token->len); +static INLINE void vp9_write_token(vp9_writer *w, const vp9_tree_index *tree, + const vp9_prob *probs, + const struct vp9_token *token) { + vp9_write_tree(w, tree, probs, token->value, token->len, 0); } -static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs, - int bits, int len) { - int cost = 0; - vp9_tree_index i = 0; - - do { - const int bit = (bits >> --len) & 1; - cost += vp9_cost_bit(probs[i >> 1], bit); - i = tree[i + bit]; - } while (len); - - return cost; -} - -static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs, - const struct vp9_token *token) { - return treed_cost(tree, probs, token->value, token->len); -} - -void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree); -void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree); +#ifdef __cplusplus +} // extern "C" +#endif #endif // VP9_ENCODER_VP9_TREEWRITER_H_ diff --git a/media/libvpx/vp9/encoder/vp9_variance.c b/media/libvpx/vp9/encoder/vp9_variance.c new file mode 100644 index 000000000000..afbb191ad050 --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_variance.c @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// first-pass of 2-D separable filter. +// +// Produces int32_t output to retain precision for next pass. Two filter taps +// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). 
It +// defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + + src_ptr++; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// second-pass of 2-D separable filter. +// +// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step=1) or vertically (pixel_step= +// stride). It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + + (int)src_ptr[pixel_step] * vp9_filter[1], + FILTER_BITS); + src_ptr++; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += src_ptr[i] * src_ptr[i]; + } + + return sum; +} + +#define VAR(W, H) \ +unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + unsigned int *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (((int64_t)sum * sum) / (W * H)); \ +} + +#define SUBPIX_VAR(W, H) \ +unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ +\ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + BILINEAR_FILTERS_2TAP(xoffset)); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ +} + +#define SUBPIX_AVG_VAR(W, H) \ +unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, H * W); \ +\ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + BILINEAR_FILTERS_2TAP(xoffset)); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + BILINEAR_FILTERS_2TAP(yoffset)); \ +\ + vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ +\ + return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ +} + +void 
vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum); +} + +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); +} + +unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum); + return *sse; +} + +unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum); + return *sse; +} + +unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum); + return *sse; +} + +unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum); + return *sse; +} + +VAR(4, 4) +SUBPIX_VAR(4, 4) +SUBPIX_AVG_VAR(4, 4) + +VAR(4, 8) +SUBPIX_VAR(4, 8) +SUBPIX_AVG_VAR(4, 8) + +VAR(8, 4) +SUBPIX_VAR(8, 4) +SUBPIX_AVG_VAR(8, 4) + +VAR(8, 8) +SUBPIX_VAR(8, 8) +SUBPIX_AVG_VAR(8, 8) + +VAR(8, 16) +SUBPIX_VAR(8, 16) +SUBPIX_AVG_VAR(8, 16) + +VAR(16, 8) +SUBPIX_VAR(16, 8) +SUBPIX_AVG_VAR(16, 8) + +VAR(16, 16) +SUBPIX_VAR(16, 16) +SUBPIX_AVG_VAR(16, 16) + +VAR(16, 32) +SUBPIX_VAR(16, 32) +SUBPIX_AVG_VAR(16, 32) + +VAR(32, 16) +SUBPIX_VAR(32, 16) +SUBPIX_AVG_VAR(32, 16) + +VAR(32, 32) +SUBPIX_VAR(32, 32) +SUBPIX_AVG_VAR(32, 32) + +VAR(32, 64) +SUBPIX_VAR(32, 64) +SUBPIX_AVG_VAR(32, 64) + +VAR(64, 32) +SUBPIX_VAR(64, 32) +SUBPIX_AVG_VAR(64, 32) + +VAR(64, 64) +SUBPIX_VAR(64, 64) +SUBPIX_AVG_VAR(64, 64) + +void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} diff --git a/media/libvpx/vp9/encoder/vp9_variance.h b/media/libvpx/vp9/encoder/vp9_variance.h index 2ded97c559e8..4a194b72cc15 100644 --- a/media/libvpx/vp9/encoder/vp9_variance.h +++ b/media/libvpx/vp9/encoder/vp9_variance.h @@ -12,29 +12,26 @@ #define VP9_ENCODER_VP9_VARIANCE_H_ #include "vpx/vpx_integer.h" -// #include "./vpx_config.h" -void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum); +#ifdef __cplusplus +extern "C" { +#endif + +void variance(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, + unsigned int *sse, int *sum); typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, - int ref_stride, - unsigned int max_sad); + int ref_stride); typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - unsigned int max_sad); + const uint8_t *second_pred); typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int source_stride, @@ -42,12 +39,6 @@ 
typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, int ref_stride, unsigned int *sad_array); -typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array); - typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t* const ref_ptr[], @@ -76,40 +67,22 @@ typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr, unsigned int *sse, const uint8_t *second_pred); -typedef unsigned int (*vp9_getmbss_fn_t)(const short *); - -typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride); - typedef struct vp9_variance_vtable { vp9_sad_fn_t sdf; vp9_sad_avg_fn_t sdaf; vp9_variance_fn_t vf; vp9_subpixvariance_fn_t svf; vp9_subp_avg_variance_fn_t svaf; - vp9_variance_fn_t svf_halfpix_h; - vp9_variance_fn_t svf_halfpix_v; - vp9_variance_fn_t svf_halfpix_hv; vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi1_fn_t sdx8f; + vp9_sad_multi_fn_t sdx8f; vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; -static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride) { - int i, j; +void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride); + +#ifdef __cplusplus +} // extern "C" +#endif - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - int tmp; - tmp = pred[j] + ref[j]; - comp_pred[j] = (tmp + 1) >> 1; - } - comp_pred += width; - pred += width; - ref += ref_stride; - } -} #endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/media/libvpx/vp9/encoder/vp9_variance_c.c b/media/libvpx/vp9/encoder/vp9_variance_c.c deleted file mode 100644 index 8bc385089915..000000000000 --- a/media/libvpx/vp9/encoder/vp9_variance_c.c +++ /dev/null @@ -1,1094 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" - -#include "vpx_ports/mem.h" -#include "vpx/vpx_integer.h" - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_filter.h" - -#include "vp9/encoder/vp9_variance.h" - -void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { - int i, j; - int diff; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input - * samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter - * taps. - * - * OUTPUTS : int32_t *output_ptr : Pointer to filtered block. 
- * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step= - * stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t src_pixels_per_line : Stride of input block. - * uint32_t pixel_step : Offset between filter input - * samples (see notes). - * uint32_t output_height : Input block height. - * uint32_t output_width : Input block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter - * taps. - * - * OUTPUTS : uint16_t *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by - * filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step= - * stride). - * It defines the offset required to move from one input - * to the next. 
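The banner comment deleted here describes the same two-pass scheme kept, in condensed form, in the new vp9_variance.c shown earlier. A worked example of a single first-pass output may help; the tap and pixel values are hypothetical and assume FILTER_BITS == 7, i.e. taps summing to VP9_FILTER_WEIGHT == 128 as the comment requires:

    /* Invented inputs: vp9_filter = {96, 32}, src_ptr[0] = 10,
     * src_ptr[pixel_step] = 20.
     * ROUND_POWER_OF_TWO(10 * 96 + 20 * 32, FILTER_BITS)
     *   = (960 + 640 + 64) >> 7
     *   = 13, kept at 16-bit precision for the second pass. */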
- * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { - unsigned int i, sum = 0; - - for (i = 0; i < 256; i++) { - sum += (src_ptr[i] * src_ptr[i]); - } - - return sum; -} - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); -} - -unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter); - - return vp9_variance64x32(temp2, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter); - comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64); - return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); -} - -unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 32, hfilter); - 
var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter); - - return vp9_variance32x64(temp2, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter); - comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32); - return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); -} - -unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter); - - return vp9_variance32x16(temp2, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter); - comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32); - return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); -} - -unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter 
= BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter); - - return vp9_variance16x32(temp2, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter); - comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16); - return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 12)); -} - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg); - *sse = var; - return (var - (((int64_t)avg * avg) >> 10)); -} - -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, - const uint8_t *ref_ptr, int ref_stride, - unsigned int *sse, int *sum) { - variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum); -} - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, 
recon_stride, 8, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); -} - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); -} - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - - -unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg); - *sse = var; - return var; -} - -unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg); - *sse = var; - return var; -} - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg); - *sse = var; - return var; -} - -unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg); - *sse = var; - return var; -} - - -unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - // First filter 1d Horizontal - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 4, hfilter); - - // Now filter Verticaly - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter); - - return vp9_variance4x4(temp2, 4, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer - uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - // First filter 1d Horizontal - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 4, hfilter); - - // Now filter Verticaly - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter); - comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4); - return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int 
vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter); - - return vp9_variance8x8(temp2, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter); - comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8); - return vp9_variance8x8(temp3, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[17 * 16]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter); - - return vp9_variance16x16(temp2, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[17 * 16]; - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter); - - comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16); - return vp9_variance16x16(temp3, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter); 
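Every per-size routine deleted from this file is replaced by the VAR/SUBPIX_VAR/SUBPIX_AVG_VAR macro instantiations in the new vp9_variance.c shown earlier. Expanding SUBPIX_VAR(8, 8) by hand (an illustrative sketch, not code from the patch) yields the equivalent of the deleted vp9_sub_pixel_variance8x8_c above:

    unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src, int src_stride,
                                             int xoffset, int yoffset,
                                             const uint8_t *dst, int dst_stride,
                                             unsigned int *sse) {
      uint16_t fdata3[(8 + 1) * 8];  /* first pass writes one extra row */
      uint8_t temp2[8 * 8];

      var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 8 + 1, 8,
                                        BILINEAR_FILTERS_2TAP(xoffset));
      var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8,
                                         BILINEAR_FILTERS_2TAP(yoffset));

      return vp9_variance8x8_c(temp2, 8, dst, dst_stride, sse);
    }

The VAR(W, H) instantiations likewise compute *sse - ((int64_t)sum * sum) / (W * H), which matches the shift-based divisions in the deleted code (>> 6 for 8x8, >> 8 for 16x16, and so on).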
- - return vp9_variance64x64(temp2, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering - uint8_t temp2[68 * 64]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 65, 64, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter); - comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64); - return vp9_variance64x64(temp3, 64, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter); - - return vp9_variance32x32(temp2, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering - uint8_t temp2[36 * 32]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 33, 32, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter); - comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32); - return vp9_variance32x32(temp3, 32, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int 
vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} - -unsigned int vp9_sub_pixel_mse16x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, - xoffset, yoffset, dst_ptr, - dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, - xoffset, yoffset, dst_ptr, - dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line, - xoffset, yoffset, dst_ptr, - dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter); - - return vp9_variance16x8(temp2, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - 
var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 16, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter); - comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16); - return vp9_variance16x8(temp3, 16, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter); - - return vp9_variance8x16(temp2, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 17, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter); - comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8); - return vp9_variance8x16(temp3, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter); - - return vp9_variance8x4(temp2, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[8 * 5]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 5, 8, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter); - comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8); - return vp9_variance8x4(temp3, 8, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - uint16_t fdata3[5 * 8]; // Temp data buffer used in 
filtering - // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be - // of this big? same issue appears in all other block size settings. - uint8_t temp2[20 * 16]; - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 4, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter); - - return vp9_variance4x8(temp2, 4, dst_ptr, dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, - const uint8_t *second_pred) { - uint16_t fdata3[5 * 8]; // Temp data buffer used in filtering - uint8_t temp2[20 * 16]; - DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8); // compound pred buffer - const int16_t *hfilter, *vfilter; - - hfilter = BILINEAR_FILTERS_2TAP(xoffset); - vfilter = BILINEAR_FILTERS_2TAP(yoffset); - - var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, - 1, 9, 4, hfilter); - var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter); - comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4); - return vp9_variance4x8(temp3, 4, dst_ptr, dst_pixels_per_line, sse); -} diff --git a/media/libvpx/vp9/encoder/vp9_write_bit_buffer.c b/media/libvpx/vp9/encoder/vp9_write_bit_buffer.c new file mode 100644 index 000000000000..6d55e84e869c --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_write_bit_buffer.c @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <limits.h> +#include "vp9/encoder/vp9_write_bit_buffer.h" + +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) { + return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); +} + +void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) { + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + if (q == CHAR_BIT -1) { + wb->bit_buffer[p] = bit << q; + } else { + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + } + wb->bit_offset = off + 1; +} + +void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits) { + int bit; + for (bit = bits - 1; bit >= 0; bit--) + vp9_wb_write_bit(wb, (data >> bit) & 1); +} diff --git a/media/libvpx/vp9/encoder/vp9_write_bit_buffer.h b/media/libvpx/vp9/encoder/vp9_write_bit_buffer.h index 6f91cfc85c98..59f9bbe30f3f 100644 --- a/media/libvpx/vp9/encoder/vp9_write_bit_buffer.h +++ b/media/libvpx/vp9/encoder/vp9_write_bit_buffer.h @@ -8,41 +8,29 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VP9_BIT_WRITE_BUFFER_H_ -#define VP9_BIT_WRITE_BUFFER_H_ - -#include <limits.h> +#ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ +#define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + struct vp9_write_bit_buffer { uint8_t *bit_buffer; size_t bit_offset; }; -static size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) { - return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); -} +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb); -static void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit) { - const int off = wb->bit_offset; - const int p = off / CHAR_BIT; - const int q = CHAR_BIT - 1 - off % CHAR_BIT; - if (q == CHAR_BIT -1) { - wb->bit_buffer[p] = bit << q; - } else { - wb->bit_buffer[p] &= ~(1 << q); - wb->bit_buffer[p] |= bit << q; - } - wb->bit_offset = off + 1; -} +void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit); -static void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, - int data, int bits) { - int bit; - for (bit = bits - 1; bit >= 0; bit--) - vp9_wb_write_bit(wb, (data >> bit) & 1); -} +void vp9_wb_write_literal(struct vp9_write_bit_buffer *wb, int data, int bits); -#endif // VP9_BIT_WRITE_BUFFER_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ diff --git a/media/libvpx/vp9/encoder/vp9_writer.c b/media/libvpx/vp9/encoder/vp9_writer.c new file mode 100644 index 000000000000..ff461f218a4a --- /dev/null +++ b/media/libvpx/vp9/encoder/vp9_writer.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include "vp9/encoder/vp9_writer.h" +#include "vp9/common/vp9_entropy.h" + +void vp9_start_encode(vp9_writer *br, uint8_t *source) { + br->lowvalue = 0; + br->range = 255; + br->count = -24; + br->buffer = source; + br->pos = 0; + vp9_write_bit(br, 0); +} + +void vp9_stop_encode(vp9_writer *br) { + int i; + + for (i = 0; i < 32; i++) + vp9_write_bit(br, 0); + + // Ensure there's no ambiguous collision with any index marker bytes + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) + br->buffer[br->pos++] = 0; +} + diff --git a/media/libvpx/vp9/encoder/vp9_boolhuff.h b/media/libvpx/vp9/encoder/vp9_writer.h similarity index 62% rename from media/libvpx/vp9/encoder/vp9_boolhuff.h rename to media/libvpx/vp9/encoder/vp9_writer.h index c3f340d1bdf2..9d161f95cf62 100644 --- a/media/libvpx/vp9/encoder/vp9_boolhuff.h +++ b/media/libvpx/vp9/encoder/vp9_writer.h @@ -8,57 +8,35 @@ * be found in the AUTHORS file in the root of the source tree. */ - -/**************************************************************************** -* -* Module Title : vp9_boolhuff.h -* -* Description : Bool Coder header file.
-* -****************************************************************************/ -#ifndef VP9_ENCODER_VP9_BOOLHUFF_H_ -#define VP9_ENCODER_VP9_BOOLHUFF_H_ +#ifndef VP9_ENCODER_VP9_WRITER_H_ +#define VP9_ENCODER_VP9_WRITER_H_ #include "vpx_ports/mem.h" +#include "vp9/common/vp9_prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { unsigned int lowvalue; unsigned int range; - unsigned int value; int count; unsigned int pos; uint8_t *buffer; - - // Variables used to track bit costs without outputing to the bitstream - unsigned int measure_cost; - unsigned long bit_counter; } vp9_writer; -extern const unsigned int vp9_prob_cost[256]; - void vp9_start_encode(vp9_writer *bc, uint8_t *buffer); void vp9_stop_encode(vp9_writer *bc); -DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]); - -static void vp9_write(vp9_writer *br, int bit, int probability) { +static INLINE void vp9_write(vp9_writer *br, int bit, int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; register unsigned int shift; -#ifdef ENTROPY_STATS -#if defined(SECTIONBITS_OUTPUT) - - if (bit) - Sectionbits[active_section] += vp9_prob_cost[255 - probability]; - else - Sectionbits[active_section] += vp9_prob_cost[probability]; - -#endif -#endif - split = 1 + (((range - 1) * probability) >> 8); range = split; @@ -100,16 +78,21 @@ static void vp9_write(vp9_writer *br, int bit, int probability) { br->range = range; } -static void vp9_write_bit(vp9_writer *w, int bit) { +static INLINE void vp9_write_bit(vp9_writer *w, int bit) { vp9_write(w, bit, 128); // vp9_prob_half } -static void vp9_write_literal(vp9_writer *w, int data, int bits) { +static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) vp9_write_bit(w, 1 & (data >> bit)); } +#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8) -#endif // VP9_ENCODER_VP9_BOOLHUFF_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP9_ENCODER_VP9_WRITER_H_ diff --git a/media/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c new file mode 100644 index 000000000000..9ea22fed2b70 --- /dev/null +++ b/media/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c @@ -0,0 +1,2710 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 +#include "vp9/common/vp9_idct.h" // for cospi constants +#include "vpx_ports/mem.h" + +#define pair256_set_epi16(a, b) \ + _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a) + +#define pair256_set_epi32(a, b) \ + _mm256_set_epi32(b, a, b, a, b, a, b, a) + + + + +#if FDCT32x32_HIGH_PRECISION +static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { + __m256i buf0, buf1; + buf0 = _mm256_mul_epu32(a, b); + a = _mm256_srli_epi64(a, 32); + b = _mm256_srli_epi64(b, 32); + buf1 = _mm256_mul_epu32(a, b); + return _mm256_add_epi64(buf0, buf1); +} + +static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { + __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm256_unpacklo_epi64(buf0, buf1); +} +#endif + +void FDCT32x32_2D_AVX2(const int16_t *input, + int16_t *output_org, int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64); + const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); + const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); + const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); + const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); + const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); + const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); + const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); + const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); + const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); + const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); + const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64,
cospi_31_64); + const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); + const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); + const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); + const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); + const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kOne = _mm256_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process sixteen columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 16) { + __m256i step1[32]; + __m256i step2[32]; + __m256i step3[32]; + __m256i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m256i *step1a = &step1[ 0]; + __m256i *step1b = &step1[31]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[ 0] = _mm256_add_epi16(ina0, inb0); + step1a[ 1] = _mm256_add_epi16(ina1, inb1); + step1a[ 2] = _mm256_add_epi16(ina2, inb2); + step1a[ 3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m256i *step1a = &step1[ 4]; + __m256i *step1b = &step1[27]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = 
_mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[ 0] = _mm256_add_epi16(ina0, inb0); + step1a[ 1] = _mm256_add_epi16(ina1, inb1); + step1a[ 2] = _mm256_add_epi16(ina2, inb2); + step1a[ 3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m256i *step1a = &step1[ 8]; + __m256i *step1b = &step1[23]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[ 0] = _mm256_add_epi16(ina0, inb0); + step1a[ 1] = _mm256_add_epi16(ina1, inb1); + step1a[ 2] = _mm256_add_epi16(ina2, inb2); + step1a[ 3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2); + step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m256i *step1a = &step1[12]; + __m256i *step1b = &step1[19]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[ 0] = _mm256_add_epi16(ina0, inb0); + step1a[ 1] = _mm256_add_epi16(ina1, inb1); + step1a[ 2] = _mm256_add_epi16(ina2, inb2); + step1a[ 3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[ 0] = _mm256_slli_epi16(step1a[ 
0], 2); + step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2); + step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2); + step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); + __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); + __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); + __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); + __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); + __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); + __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); + __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); + step1[ 0] = _mm256_add_epi16(in00, in31); + step1[ 1] = _mm256_add_epi16(in01, in30); + step1[ 2] = _mm256_add_epi16(in02, in29); + step1[ 3] = _mm256_add_epi16(in03, in28); + step1[28] = _mm256_sub_epi16(in03, in28); + step1[29] = _mm256_sub_epi16(in02, in29); + step1[30] = _mm256_sub_epi16(in01, in30); + step1[31] = _mm256_sub_epi16(in00, in31); + } + { + __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); + __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); + __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); + __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); + __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); + __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); + __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); + __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); + step1[ 4] = _mm256_add_epi16(in04, in27); + step1[ 5] = _mm256_add_epi16(in05, in26); + step1[ 6] = _mm256_add_epi16(in06, in25); + step1[ 7] = _mm256_add_epi16(in07, in24); + step1[24] = _mm256_sub_epi16(in07, in24); + step1[25] = _mm256_sub_epi16(in06, in25); + step1[26] = _mm256_sub_epi16(in05, in26); + step1[27] = _mm256_sub_epi16(in04, in27); + } + { + __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); + __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); + __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); + __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); + __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); + __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); + __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); + __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); + step1[ 8] = _mm256_add_epi16(in08, in23); + step1[ 9] = _mm256_add_epi16(in09, in22); + step1[10] = _mm256_add_epi16(in10, in21); + step1[11] = _mm256_add_epi16(in11, in20); + step1[20] = _mm256_sub_epi16(in11, in20); + step1[21] = _mm256_sub_epi16(in10, in21); + step1[22] = _mm256_sub_epi16(in09, in22); + step1[23] = _mm256_sub_epi16(in08, in23); + } + { + __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); + 
__m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); + __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); + __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); + __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); + __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); + __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); + __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); + step1[12] = _mm256_add_epi16(in12, in19); + step1[13] = _mm256_add_epi16(in13, in18); + step1[14] = _mm256_add_epi16(in14, in17); + step1[15] = _mm256_add_epi16(in15, in16); + step1[16] = _mm256_sub_epi16(in15, in16); + step1[17] = _mm256_sub_epi16(in14, in17); + step1[18] = _mm256_sub_epi16(in13, in18); + step1[19] = _mm256_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[ 0] = _mm256_add_epi16(step1[0], step1[15]); + step2[ 1] = _mm256_add_epi16(step1[1], step1[14]); + step2[ 2] = _mm256_add_epi16(step1[2], step1[13]); + step2[ 3] = _mm256_add_epi16(step1[3], step1[12]); + step2[ 4] = _mm256_add_epi16(step1[4], step1[11]); + step2[ 5] = _mm256_add_epi16(step1[5], step1[10]); + step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]); + step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]); + step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]); + step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]); + step2[10] = _mm256_sub_epi16(step1[5], step1[10]); + step2[11] = _mm256_sub_epi16(step1[4], step1[11]); + step2[12] = _mm256_sub_epi16(step1[3], step1[12]); + step2[13] = _mm256_sub_epi16(step1[2], step1[13]); + step2[14] = _mm256_sub_epi16(step1[1], step1[14]); + step2[15] = _mm256_sub_epi16(step1[0], step1[15]); + } + { + const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]); + const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]); + const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]); + const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]); + const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]); + const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]); + const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]); + const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]); + const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const 
__m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7); + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. 
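The if (1 == pass) block that follows halves the magnitude twice, i.e. divides every 16-bit lane by 4 with round-to-nearest, ties toward zero: _mm256_cmpgt_epi16(kZero, x) is -1 exactly in the negative lanes, so subtracting that mask adds 1 there, cancelling the arithmetic shift's bias toward minus infinity. A minimal per-lane sketch of the same arithmetic in scalar C (the helper name round_shift2 is illustrative only, not part of this patch):

#include <stdint.h>

static int16_t round_shift2(int16_t x) {
  const int16_t neg = (int16_t)(x < 0);  // stands in for subtracting the cmpgt mask
  return (int16_t)((x + neg + 1) >> 2);  // sub mask, add kOne, srai by 2
}

For instance, round_shift2(6) == 1 and round_shift2(-6) == -1, so the rounding is symmetric about zero while keeping the pass-2 intermediates within 16 bits, as the comment above notes.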
+ if (1 == pass) { + __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]); + __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]); + __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]); + __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]); + __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]); + __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]); + __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]); + __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]); + __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]); + __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]); + __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]); + __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]); + __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]); + __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]); + __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]); + __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]); + __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]); + __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]); + __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]); + __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]); + __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]); + __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]); + __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]); + __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]); + __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]); + __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]); + __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]); + __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]); + __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]); + __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]); + __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]); + __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]); + + step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0); + step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0); + step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0); + step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0); + step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0); + step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0); + step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0); + step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0); + step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0); + step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0); + step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm256_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm256_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm256_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm256_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm256_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm256_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm256_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm256_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm256_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm256_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm256_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm256_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm256_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm256_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm256_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm256_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm256_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); + + step2[ 0] = _mm256_add_epi16(step2[ 0], 
kOne); + step2[ 1] = _mm256_add_epi16(step2[ 1], kOne); + step2[ 2] = _mm256_add_epi16(step2[ 2], kOne); + step2[ 3] = _mm256_add_epi16(step2[ 3], kOne); + step2[ 4] = _mm256_add_epi16(step2[ 4], kOne); + step2[ 5] = _mm256_add_epi16(step2[ 5], kOne); + step2[ 6] = _mm256_add_epi16(step2[ 6], kOne); + step2[ 7] = _mm256_add_epi16(step2[ 7], kOne); + step2[ 8] = _mm256_add_epi16(step2[ 8], kOne); + step2[ 9] = _mm256_add_epi16(step2[ 9], kOne); + step2[10] = _mm256_add_epi16(step2[10], kOne); + step2[11] = _mm256_add_epi16(step2[11], kOne); + step2[12] = _mm256_add_epi16(step2[12], kOne); + step2[13] = _mm256_add_epi16(step2[13], kOne); + step2[14] = _mm256_add_epi16(step2[14], kOne); + step2[15] = _mm256_add_epi16(step2[15], kOne); + step1[16] = _mm256_add_epi16(step1[16], kOne); + step1[17] = _mm256_add_epi16(step1[17], kOne); + step1[18] = _mm256_add_epi16(step1[18], kOne); + step1[19] = _mm256_add_epi16(step1[19], kOne); + step2[20] = _mm256_add_epi16(step2[20], kOne); + step2[21] = _mm256_add_epi16(step2[21], kOne); + step2[22] = _mm256_add_epi16(step2[22], kOne); + step2[23] = _mm256_add_epi16(step2[23], kOne); + step2[24] = _mm256_add_epi16(step2[24], kOne); + step2[25] = _mm256_add_epi16(step2[25], kOne); + step2[26] = _mm256_add_epi16(step2[26], kOne); + step2[27] = _mm256_add_epi16(step2[27], kOne); + step1[28] = _mm256_add_epi16(step1[28], kOne); + step1[29] = _mm256_add_epi16(step1[29], kOne); + step1[30] = _mm256_add_epi16(step1[30], kOne); + step1[31] = _mm256_add_epi16(step1[31], kOne); + + step2[ 0] = _mm256_srai_epi16(step2[ 0], 2); + step2[ 1] = _mm256_srai_epi16(step2[ 1], 2); + step2[ 2] = _mm256_srai_epi16(step2[ 2], 2); + step2[ 3] = _mm256_srai_epi16(step2[ 3], 2); + step2[ 4] = _mm256_srai_epi16(step2[ 4], 2); + step2[ 5] = _mm256_srai_epi16(step2[ 5], 2); + step2[ 6] = _mm256_srai_epi16(step2[ 6], 2); + step2[ 7] = _mm256_srai_epi16(step2[ 7], 2); + step2[ 8] = _mm256_srai_epi16(step2[ 8], 2); + step2[ 9] = _mm256_srai_epi16(step2[ 9], 2); + step2[10] = _mm256_srai_epi16(step2[10], 2); + step2[11] = _mm256_srai_epi16(step2[11], 2); + step2[12] = _mm256_srai_epi16(step2[12], 2); + step2[13] = _mm256_srai_epi16(step2[13], 2); + step2[14] = _mm256_srai_epi16(step2[14], 2); + step2[15] = _mm256_srai_epi16(step2[15], 2); + step1[16] = _mm256_srai_epi16(step1[16], 2); + step1[17] = _mm256_srai_epi16(step1[17], 2); + step1[18] = _mm256_srai_epi16(step1[18], 2); + step1[19] = _mm256_srai_epi16(step1[19], 2); + step2[20] = _mm256_srai_epi16(step2[20], 2); + step2[21] = _mm256_srai_epi16(step2[21], 2); + step2[22] = _mm256_srai_epi16(step2[22], 2); + step2[23] = _mm256_srai_epi16(step2[23], 2); + step2[24] = _mm256_srai_epi16(step2[24], 2); + step2[25] = _mm256_srai_epi16(step2[25], 2); + step2[26] = _mm256_srai_epi16(step2[26], 2); + step2[27] = _mm256_srai_epi16(step2[27], 2); + step1[28] = _mm256_srai_epi16(step1[28], 2); + step1[29] = _mm256_srai_epi16(step1[29], 2); + step1[30] = _mm256_srai_epi16(step1[30], 2); + step1[31] = _mm256_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = 
_mm256_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm256_add_epi16(step2[23], step1[16]); + step3[17] = _mm256_add_epi16(step2[22], step1[17]); + step3[18] = _mm256_add_epi16(step2[21], step1[18]); + step3[19] = _mm256_add_epi16(step2[20], step1[19]); + step3[20] = _mm256_sub_epi16(step1[19], step2[20]); + step3[21] = _mm256_sub_epi16(step1[18], step2[21]); + step3[22] = _mm256_sub_epi16(step1[17], step2[22]); + step3[23] = _mm256_sub_epi16(step1[16], step2[23]); + step3[24] = _mm256_sub_epi16(step1[31], step2[24]); + step3[25] = _mm256_sub_epi16(step1[30], step2[25]); + step3[26] = _mm256_sub_epi16(step1[29], step2[26]); + step3[27] = _mm256_sub_epi16(step1[28], step2[27]); + step3[28] = _mm256_add_epi16(step2[27], step1[28]); + step3[29] = _mm256_add_epi16(step2[26], step1[29]); + step3[30] = _mm256_add_epi16(step2[25], step1[30]); + step3[31] = _mm256_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]); + step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]); + step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]); + step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]); + step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]); + step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]); + 
step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]); + step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]); + step1[12] = _mm256_sub_epi16(step2[15], step3[12]); + step1[13] = _mm256_sub_epi16(step2[14], step3[13]); + step1[14] = _mm256_add_epi16(step3[13], step2[14]); + step1[15] = _mm256_add_epi16(step3[12], step2[15]); + } + { + const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); + const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); + const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); + const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); + const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); + const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); + const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); + const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); + const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); + const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); + const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const 
__m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm256_add_epi16(step1[5], step3[4]); + step2[5] = _mm256_sub_epi16(step3[4], step1[5]); + step2[6] = _mm256_sub_epi16(step3[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[6], step3[7]); + } + { + const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); + const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); + const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); + const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); + const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, 
k__cospi_m08_p24); + // dct_const_round_shift + const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7); + out[16] = _mm256_packs_epi32(out_16_6, out_16_7); + out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7); + out[24] = _mm256_packs_epi32(out_24_6, out_24_7); + } + { + const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]); + const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]); + const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); + const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); + const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[ 9] 
= _mm256_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm256_add_epi16(step1[19], step3[16]); + step2[17] = _mm256_add_epi16(step1[18], step3[17]); + step2[18] = _mm256_sub_epi16(step3[17], step1[18]); + step2[19] = _mm256_sub_epi16(step3[16], step1[19]); + step2[20] = _mm256_sub_epi16(step3[23], step1[20]); + step2[21] = _mm256_sub_epi16(step3[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[21], step3[22]); + step2[23] = _mm256_add_epi16(step1[20], step3[23]); + step2[24] = _mm256_add_epi16(step1[27], step3[24]); + step2[25] = _mm256_add_epi16(step1[26], step3[25]); + step2[26] = _mm256_sub_epi16(step3[25], step1[26]); + step2[27] = _mm256_sub_epi16(step3[24], step1[27]); + step2[28] = _mm256_sub_epi16(step3[31], step1[28]); + step2[29] = _mm256_sub_epi16(step3[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step3[30]); + step2[31] = _mm256_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7); + out[20] = 
_mm256_packs_epi32(out_20_6, out_20_7); + out[12] = _mm256_packs_epi32(out_12_6, out_12_7); + out[28] = _mm256_packs_epi32(out_28_6, out_28_7); + } + { + step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]); + step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]); + step3[10] = _mm256_sub_epi16(step1[11], step2[10]); + step3[11] = _mm256_add_epi16(step2[10], step1[11]); + step3[12] = _mm256_add_epi16(step2[13], step1[12]); + step3[13] = _mm256_sub_epi16(step1[12], step2[13]); + step3[14] = _mm256_sub_epi16(step1[15], step2[14]); + step3[15] = _mm256_add_epi16(step2[14], step1[15]); + } + { + const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); + const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); + const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); + const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); + const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); + const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); + const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); + const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); + const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m256i s3_25_4 = 
_mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]); + const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]); + const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]); + const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]); + const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); + const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); + const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); + const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); + const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + 
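// Every out_*_4/out_*_5 term in this block adds the rounding bias + // k__DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)), so the arithmetic + // shifts by DCT_CONST_BITS below implement the scalar dct_const_round_shift, + // i.e. out = (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS, a round-to-nearest + // division by 1 << DCT_CONST_BITS. +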
const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7); + out[18] = _mm256_packs_epi32(out_18_6, out_18_7); + out[10] = _mm256_packs_epi32(out_10_6, out_10_7); + out[26] = _mm256_packs_epi32(out_26_6, out_26_7); + out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7); + out[22] = _mm256_packs_epi32(out_22_6, out_22_7); + out[14] = _mm256_packs_epi32(out_14_6, out_14_7); + out[30] = _mm256_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm256_add_epi16(step3[17], step2[16]); + step1[17] = _mm256_sub_epi16(step2[16], step3[17]); + step1[18] = _mm256_sub_epi16(step2[19], step3[18]); + step1[19] = _mm256_add_epi16(step3[18], step2[19]); + step1[20] = _mm256_add_epi16(step3[21], step2[20]); + step1[21] = _mm256_sub_epi16(step2[20], step3[21]); + step1[22] = _mm256_sub_epi16(step2[23], step3[22]); + step1[23] = _mm256_add_epi16(step3[22], step2[23]); + step1[24] = _mm256_add_epi16(step3[25], step2[24]); + step1[25] = _mm256_sub_epi16(step2[24], step3[25]); + step1[26] = _mm256_sub_epi16(step2[27], step3[26]); + step1[27] = _mm256_add_epi16(step3[26], step2[27]); + step1[28] = _mm256_add_epi16(step3[29], step2[28]); + step1[29] = _mm256_sub_epi16(step2[28], step3[29]); + step1[30] = _mm256_sub_epi16(step2[31], step3[30]); + step1[31] = _mm256_add_epi16(step3[30], step2[31]); + } + // 
Final stage --- output indices are bit-reversed. + { + const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); + const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); + const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); + const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); + const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); + const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); + const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); + const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); + const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m256i out_25_6 =
_mm256_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7); + out[17] = _mm256_packs_epi32(out_17_6, out_17_7); + out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7); + out[25] = _mm256_packs_epi32(out_25_6, out_25_7); + out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7); + out[23] = _mm256_packs_epi32(out_23_6, out_23_7); + out[15] = _mm256_packs_epi32(out_15_6, out_15_7); + out[31] = _mm256_packs_epi32(out_31_6, out_31_7); + } + { + const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); + const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); + const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); + const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); + const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); + const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); + const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); + const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); + const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const 
__m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7); + out[21] = _mm256_packs_epi32(out_21_6, out_21_7); + out[13] = _mm256_packs_epi32(out_13_6, out_13_7); + out[29] = _mm256_packs_epi32(out_29_6, out_29_7); + out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7); + out[19] = _mm256_packs_epi32(out_19_6, out_19_7); + out[11] = _mm256_packs_epi32(out_11_6, out_11_7); + out[27] = _mm256_packs_epi32(out_27_6, out_27_7); + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m256i lstep1[64], lstep2[64], lstep3[64]; + __m256i u[32], v[32], sign[16]; + const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length prior to addition operations + lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero); + lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero); + lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero); + lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero); + lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero); + lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero); + lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero); + lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero); + lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero); + lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero); + lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero); + lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero); + lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero); + lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero); + lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero); + lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero); + lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne); + lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne); + lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
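+ // The unpack-with-kZero / madd-with-kOne pairs in this block sign-extend + // each 16-bit coefficient to 32 bits: interleaving with zero widens the + // lanes, and _mm256_madd_epi16 against a vector of ones computes v*1 + 0*1 + // as a signed 16x16->32 product, preserving the sign of negative values.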
+ lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne); + lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne); + lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne); + lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne); + lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne); + lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne); + lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne); + lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); + lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); + lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); + lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne); + lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); + lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); + + lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]); + lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]); + lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]); + lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]); + lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]); + lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]); + lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]); + lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]); + lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]); + lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]); + lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]); + lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]); + lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]); + lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]); + lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]); + lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm256_srai_epi32(s3_13_5, 
DCT_CONST_BITS); + } + { + lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero); + lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero); + lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero); + lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero); + lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero); + lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero); + lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero); + lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero); + lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero); + lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero); + lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero); + lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero); + lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero); + lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero); + lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero); + lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero); + lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne); + lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne); + lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne); + lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne); + lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne); + lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne); + lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne); + lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne); + lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne); + lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne); + lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne); + lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne); + lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne); + lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne); + lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne); + lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne); + + lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero); + lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero); + lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero); + lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero); + lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero); + lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero); + lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero); + lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero); + lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero); + lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero); + lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero); + lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero); + lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero); + lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero); + lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero); + lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero); + lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne); + lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne); + lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne); + lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne); + lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne); + lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne); + lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne); + lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne); + lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne); + lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne); + lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne); + lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne); + lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne); + lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne); + lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne); + lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne); + + lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] 
= _mm256_add_epi32(lstep2[47], lstep1[33]); + + lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length prior to addition operations + lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero); + lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero); + lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero); + lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero); + lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); + lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); + lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); + lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero); + lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne); + lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne); + lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne); + lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne); + lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne); + lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne); + lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); + lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); + + lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]); + lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]); + lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]); + lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]); + lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]); + lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]); + lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]); + lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]); + lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm256_sub_epi32(lstep2[16],
lstep3[22]); + lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); + u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); + u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); + u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); + u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); + u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); + u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); + u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); + u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); + u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24); + v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24); + v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24); + v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24); + v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24); + v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24); + v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24); + v[ 7] = 
k_madd_epi32_avx2(u[ 7], k32_m08_p24); + v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08); + v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08); + v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); + v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); + v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); + v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08); + v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08); + v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08); + v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24); + v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); + v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); + v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); + v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24); + v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24); + v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); + v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); + v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08); + v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08); + v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08); + v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08); + v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08); + v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08); + v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08); + v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08); + + u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64_avx2(v[10], v[11]); + u[ 6] = k_packs_epi64_avx2(v[12], v[13]); + u[ 7] = k_packs_epi64_avx2(v[14], v[15]); + u[ 8] = k_packs_epi64_avx2(v[16], v[17]); + u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = 
_mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]); + lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]); + lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]); + lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]); + lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64); + const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16); + v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero,u[0]); + sign[1] = 
_mm256_cmpgt_epi32(kZero,u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero,u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero,u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero,u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero,u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero,u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero,u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + // Combine + out[ 0] = _mm256_packs_epi32(u[0], u[1]); + out[16] = _mm256_packs_epi32(u[2], u[3]); + out[ 8] = _mm256_packs_epi32(u[4], u[5]); + out[24] = _mm256_packs_epi32(u[6], u[7]); + } + { + const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08); + v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); + v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); + v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); + v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm256_add_epi32(u[5], 
k__DCT_CONST_ROUNDING); + u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64); + const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]); + u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]); + u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]); + u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]); + u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]); + u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]); + u[14] = 
_mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]); + u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04); + v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20); + v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); + v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); + v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); + v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12); + v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); + v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28); + v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28); + v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero,u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero,u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero,u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero,u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero,u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero,u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero,u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero,u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + out[ 4] = _mm256_packs_epi32(u[0], u[1]); + out[20] = _mm256_packs_epi32(u[2], u[3]); + out[12] = _mm256_packs_epi32(u[4], u[5]); + out[28] = _mm256_packs_epi32(u[6], 
u[7]); + } + { + lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64); + const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64); + const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64, + -cospi_20_64); + const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64); + + u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); + u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); + u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); + u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); + u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); + u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); + u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); + u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); + u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); + u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28); + v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28); + v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28); + v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28); + v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04); + v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04); + v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04); + v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04); + v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12); + v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); + v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20); + v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20); + v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20); + v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12); + v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); + v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); + v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); + v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20); + v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20); + v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); + v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); + v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28); + v[25] = 
k_madd_epi32_avx2(u[ 5], k32_m04_p28); + v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28); + v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28); + v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04); + v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04); + v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04); + v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04); + + u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64_avx2(v[10], v[11]); + u[ 6] = k_packs_epi64_avx2(v[12], v[13]); + u[ 7] = k_packs_epi64_avx2(v[14], v[15]); + u[ 8] = k_packs_epi64_avx2(v[16], v[17]); + u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); + lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); + lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); + lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); + lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); + lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); + lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); + lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); + lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); + lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64); + const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64); + const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64); + const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64, cospi_26_64); + const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64); + const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64); + const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64); + const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64); + + u[ 0] = _mm256_unpacklo_epi32(lstep3[16], 
lstep3[30]); + u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); + u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); + u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); + u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); + u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); + u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); + u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); + u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); + u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02); + v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02); + v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02); + v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02); + v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18); + v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18); + v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18); + v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18); + v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10); + v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10); + v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); + v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); + v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); + v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26); + v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26); + v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26); + v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06); + v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); + v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); + v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); + v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22); + v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22); + v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); + v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); + v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14); + v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14); + v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14); + v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14); + v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30); + v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30); + v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30); + v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30); + + u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64_avx2(v[10], v[11]); + u[ 6] = k_packs_epi64_avx2(v[12], v[13]); + u[ 7] = k_packs_epi64_avx2(v[14], v[15]); + u[ 8] = k_packs_epi64_avx2(v[16], v[17]); + u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + 
v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); + v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); + v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); + v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); + v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); + v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); + v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); + v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); + v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); + v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); + v[10] = _mm256_cmpgt_epi32(kZero,u[10]); + v[11] = _mm256_cmpgt_epi32(kZero,u[11]); + v[12] = _mm256_cmpgt_epi32(kZero,u[12]); + v[13] = _mm256_cmpgt_epi32(kZero,u[13]); + v[14] = _mm256_cmpgt_epi32(kZero,u[14]); + v[15] = _mm256_cmpgt_epi32(kZero,u[15]); + + u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[ 0] = _mm256_add_epi32(u[ 0], K32One); + v[ 1] = _mm256_add_epi32(u[ 1], K32One); + v[ 2] = _mm256_add_epi32(u[ 2], K32One); + v[ 3] = _mm256_add_epi32(u[ 3], K32One); + v[ 4] = _mm256_add_epi32(u[ 4], K32One); + v[ 5] = _mm256_add_epi32(u[ 5], K32One); + v[ 6] = _mm256_add_epi32(u[ 6], K32One); + v[ 7] = _mm256_add_epi32(u[ 7], K32One); + v[ 8] = _mm256_add_epi32(u[ 8], K32One); + v[ 9] = _mm256_add_epi32(u[ 9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[ 0] = _mm256_srai_epi32(v[ 0], 2); + u[ 1] = _mm256_srai_epi32(v[ 1], 2); + u[ 2] = _mm256_srai_epi32(v[ 2], 2); + u[ 3] = 
_mm256_srai_epi32(v[ 3], 2); + u[ 4] = _mm256_srai_epi32(v[ 4], 2); + u[ 5] = _mm256_srai_epi32(v[ 5], 2); + u[ 6] = _mm256_srai_epi32(v[ 6], 2); + u[ 7] = _mm256_srai_epi32(v[ 7], 2); + u[ 8] = _mm256_srai_epi32(v[ 8], 2); + u[ 9] = _mm256_srai_epi32(v[ 9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[ 2] = _mm256_packs_epi32(u[0], u[1]); + out[18] = _mm256_packs_epi32(u[2], u[3]); + out[10] = _mm256_packs_epi32(u[4], u[5]); + out[26] = _mm256_packs_epi32(u[6], u[7]); + out[ 6] = _mm256_packs_epi32(u[8], u[9]); + out[22] = _mm256_packs_epi32(u[10], u[11]); + out[14] = _mm256_packs_epi32(u[12], u[13]); + out[30] = _mm256_packs_epi32(u[14], u[15]); + } + { + lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64); + const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64); + const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64); + const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64); + const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64); + const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64); + const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64); + const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64); + + u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); + u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); + u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); + u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); + u[ 4] = 
_mm256_unpacklo_epi32(lstep1[34], lstep1[60]); + u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); + u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); + u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); + u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); + u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01); + v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01); + v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01); + v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01); + v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17); + v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17); + v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17); + v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17); + v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09); + v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09); + v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); + v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); + v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); + v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25); + v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25); + v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25); + v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07); + v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); + v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); + v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); + v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23); + v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23); + v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); + v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); + v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15); + v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15); + v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15); + v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15); + v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31); + v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31); + v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31); + v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31); + + u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64_avx2(v[10], v[11]); + u[ 6] = k_packs_epi64_avx2(v[12], v[13]); + u[ 7] = k_packs_epi64_avx2(v[14], v[15]); + u[ 8] = k_packs_epi64_avx2(v[16], v[17]); + u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = 
_mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); + v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); + v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); + v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); + v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); + v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); + v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); + v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); + v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); + v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); + v[10] = _mm256_cmpgt_epi32(kZero,u[10]); + v[11] = _mm256_cmpgt_epi32(kZero,u[11]); + v[12] = _mm256_cmpgt_epi32(kZero,u[12]); + v[13] = _mm256_cmpgt_epi32(kZero,u[13]); + v[14] = _mm256_cmpgt_epi32(kZero,u[14]); + v[15] = _mm256_cmpgt_epi32(kZero,u[15]); + + u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); + u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = 
_mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[ 1] = _mm256_packs_epi32(u[0], u[1]); + out[17] = _mm256_packs_epi32(u[2], u[3]); + out[ 9] = _mm256_packs_epi32(u[4], u[5]); + out[25] = _mm256_packs_epi32(u[6], u[7]); + out[ 7] = _mm256_packs_epi32(u[8], u[9]); + out[23] = _mm256_packs_epi32(u[10], u[11]); + out[15] = _mm256_packs_epi32(u[12], u[13]); + out[31] = _mm256_packs_epi32(u[14], u[15]); + } + { + const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64); + const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64); + const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64); + const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64); + const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64); + const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64); + const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64); + const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64); + + u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); + u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); + u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); + u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); + u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); + u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); + u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); + u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); + u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); + u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05); + v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05); + v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05); + v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05); + v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21); + v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21); + v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21); + v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21); + v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13); + v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13); + v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); + v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); + v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); + v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29); + v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29); + v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29); + v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03); + v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); + v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); + v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); + v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19); + v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19); + v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); + v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); + v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11); + v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11); + v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11); + v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11); + v[28] = k_madd_epi32_avx2(u[ 0], 
k32_m05_p27); + v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27); + v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27); + v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27); + + u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]); + u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]); + u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]); + u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]); + u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]); + u[ 5] = k_packs_epi64_avx2(v[10], v[11]); + u[ 6] = k_packs_epi64_avx2(v[12], v[13]); + u[ 7] = k_packs_epi64_avx2(v[14], v[15]); + u[ 8] = k_packs_epi64_avx2(v[16], v[17]); + u[ 9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); + v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); + v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); + v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); + v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); + v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); + v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); + v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); + v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); + v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS); + u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS); + u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS); + u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS); + u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS); + u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS); + u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS); + u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS); + u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS); + u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]); + v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]); + v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]); + v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]); + v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]); + v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]); + v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]); + v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]); + v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]); + v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]); + v[10] = _mm256_cmpgt_epi32(kZero,u[10]); + v[11] = _mm256_cmpgt_epi32(kZero,u[11]); + v[12] = _mm256_cmpgt_epi32(kZero,u[12]); + v[13] = _mm256_cmpgt_epi32(kZero,u[13]); + v[14] = _mm256_cmpgt_epi32(kZero,u[14]); + v[15] = _mm256_cmpgt_epi32(kZero,u[15]); + + u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]); + u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]); + u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]); + u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]); + u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]); + u[ 5] = 
_mm256_sub_epi32(u[ 5], v[ 5]); + u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]); + u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]); + u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]); + u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[ 5] = _mm256_packs_epi32(u[0], u[1]); + out[21] = _mm256_packs_epi32(u[2], u[3]); + out[13] = _mm256_packs_epi32(u[4], u[5]); + out[29] = _mm256_packs_epi32(u[6], u[7]); + out[ 3] = _mm256_packs_epi32(u[8], u[9]); + out[19] = _mm256_packs_epi32(u[10], u[11]); + out[11] = _mm256_packs_epi32(u[12], u[13]); + out[27] = _mm256_packs_epi32(u[14], u[15]); + } + } +#endif + // Transpose the results, do it as four 8x8 transposes. 
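For reference before the transpose: the stage 7 and stage 8 output blocks above all finish with the same three-step vector idiom, a compare against kZero to build a sign mask, a subtract of that mask (which adds 1 to negative lanes), then an add of K32One and an arithmetic shift right by 2. As a scalar sketch (not part of the patch; round_shift2 is a hypothetical name), each already-descaled lane u is reduced as:

  /* Scalar model of the cmpgt/sub/add/srai run above: computes
   * (u + (u < 0) + 1) >> 2, a divide-by-4 that rounds symmetrically
   * about zero. */
  static int32_t round_shift2(int32_t u) {
    const int32_t sign_mask = -(u < 0);  /* all-ones when u < 0, like _mm256_cmpgt_epi32(kZero, u) */
    u -= sign_mask;                      /* u += 1 for negative lanes */
    return (u + 1) >> 2;                 /* the K32One add and the srai by 2 */
  }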
+      {
+        int transpose_block;
+        int16_t *output_currStep, *output_nextStep;
+        if (0 == pass) {
+          output_currStep = &intermediate[column_start * 32];
+          output_nextStep = &intermediate[(column_start + 8) * 32];
+        } else {
+          output_currStep = &output_org[column_start * 32];
+          output_nextStep = &output_org[(column_start + 8) * 32];
+        }
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m256i *this_out = &out[8 * transpose_block];
+          // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+          // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+          // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
+          // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
+          // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
+          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31
+          // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71
+          // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35
+          // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75
+          // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 111
+          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+          // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115
+          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69
+          // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73
+          // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71
+          // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75
+          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+          // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+          // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+            // ... and here.
+            // PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic makes the code slightly slower.
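Each transposed row tr2_N is one __m256i holding 16 coefficients, but its two halves belong to output rows 8 * 32 elements apart, so the stores below write each row as two 128-bit unaligned stores. A self-contained sketch of that lane split (the helper name is hypothetical; assumes <immintrin.h> and an AVX2 build):

  #include <immintrin.h>
  #include <stdint.h>

  /* Store a 256-bit row of 16 int16 coefficients as two 8-coefficient
   * halves: the low lane to the current output row, the high lane to
   * the row eight 32-element strides later. */
  static void store_split_row(int16_t *curr, int16_t *next, __m256i row) {
    _mm_storeu_si128((__m128i *)curr, _mm256_castsi256_si128(row));       /* low 128 bits */
    _mm_storeu_si128((__m128i *)next, _mm256_extractf128_si256(row, 1));  /* high 128 bits */
  }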
+          _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+          _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+          _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+          _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+          _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+          _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+          _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+          _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+          _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+          // Process next 8x8
+          output_currStep += 8;
+          output_nextStep += 8;
+        }
+      }
+    }
+  }
+}  // NOLINT
diff --git a/media/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/media/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 2d59775cec62..42fdbbdc5ce6 100644
--- a/media/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/media/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -12,6 +12,9 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
 
+#define pair_set_epi32(a, b) \
+  _mm_set_epi32(b, a, b, a)
+
 #if FDCT32x32_HIGH_PRECISION
 static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
   __m128i buf0, buf1;
diff --git a/media/libvpx/vp9/encoder/x86/vp9_dct_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
new file mode 100644
index 000000000000..3a19f52746ce
--- /dev/null
+++ b/media/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"  // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
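The new vp9_dct_avx2.c above is only a driver: it compiles the vp9_dct32x32_avx2.c template twice, once per FDCT32x32_HIGH_PRECISION setting, so one source body yields both vp9_fdct32x32_rd_avx2 and vp9_fdct32x32_avx2. A minimal standalone illustration of the pattern (file and function names here are hypothetical, not from the patch):

  /* txfm_template.c -- hypothetical template body, compiled once per
   * configuration by the driver file below. */
  void TXFM_NAME(const short *in, short *out, int n) {
    int i;
    for (i = 0; i < n; ++i)
  #if TXFM_HIGH_PRECISION
      out[i] = (short)(in[i] * 2);         /* bit-exact variant */
  #else
      out[i] = (short)((in[i] * 2) & ~1);  /* cheaper variant, drops the low bit */
  #endif
  }

  /* txfm_drivers.c -- stamps out two differently named functions */
  #define TXFM_NAME fwd_txfm_rd
  #define TXFM_HIGH_PRECISION 0
  #include "txfm_template.c"
  #undef TXFM_NAME
  #undef TXFM_HIGH_PRECISION

  #define TXFM_NAME fwd_txfm
  #define TXFM_HIGH_PRECISION 1
  #include "txfm_template.c"
  #undef TXFM_NAME
  #undef TXFM_HIGH_PRECISION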
diff --git a/media/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/media/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
new file mode 100644
index 000000000000..f71181c5e91a
--- /dev/null
+++ b/media/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -0,0 +1,70 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS. All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+  paddw           m0, m1
+  movq            m4, m0
+  psubw           m3, m2
+  psubw           m4, m3
+  psraw           m4, 1
+  movq            m5, m4
+  psubw           m5, m1 ;b1
+  psubw           m4, m2 ;c1
+  psubw           m0, m4
+  paddw           m3, m5
+  ; m0 a0
+  SWAP            1, 4   ; m1 c1
+  SWAP            2, 3   ; m2 d1
+  SWAP            3, 5   ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+  movq            m4, m0
+  movq            m5, m2
+  punpcklwd       m4, m1
+  punpckhwd       m0, m1
+  punpcklwd       m5, m3
+  punpckhwd       m2, m3
+  movq            m1, m4
+  movq            m3, m0
+  punpckldq       m1, m5
+  punpckhdq       m4, m5
+  punpckldq       m3, m2
+  punpckhdq       m0, m2
+  SWAP            2, 3, 0, 1, 4
+%endmacro
+
+INIT_MMX mmx
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+  lea             r3q, [inputq + strideq*4]
+  movq            m0, [inputq]             ;a1
+  movq            m1, [inputq + strideq*2] ;b1
+  movq            m2, [r3q]                ;c1
+  movq            m3, [r3q + strideq*2]    ;d1
+
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+  TRANSFORM_COLS
+  TRANSPOSE_4X4
+
+  psllw           m0, 2
+  psllw           m1, 2
+  psllw           m2, 2
+  psllw           m3, 2
+
+  movq            [outputq], m0
+  movq            [outputq + 8], m1
+  movq            [outputq + 16], m2
+  movq            [outputq + 24], m3
+
+  RET
diff --git a/media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index fefca660d7ea..e799951c2b3c 100644
--- a/media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,41 +12,114 @@
 #include "vp9/common/vp9_idct.h"  // for cospi constants
 #include "vpx_ports/mem.h"
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
+  __m128i in0, in1;
+  __m128i tmp;
+  const __m128i zero = _mm_setzero_si128();
+  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+        (input + 2 * stride)));
+  in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+        (input + 3 * stride)));
+
+  tmp = _mm_add_epi16(in0, in1);
+  in0 = _mm_unpacklo_epi16(zero, tmp);
+  in1 = _mm_unpackhi_epi16(zero, tmp);
+  in0 = _mm_srai_epi32(in0, 16);
+  in1 = _mm_srai_epi32(in1, 16);
+
+  tmp = _mm_add_epi32(in0, in1);
+  in0 = _mm_unpacklo_epi32(tmp, zero);
+  in1 = _mm_unpackhi_epi32(tmp, zero);
+
+  tmp = _mm_add_epi32(in0, in1);
+  in0 = _mm_srli_si128(tmp, 8);
+
+  in1 = _mm_add_epi32(tmp, in0);
+  in0 = _mm_slli_epi32(in1, 1);
+  _mm_store_si128((__m128i *)(output), in0);
+}
+
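vp9_fdct4x4_1_sse2 above computes only the DC coefficient: the unpack-with-zero plus srai-by-16 pairs sign-extend the packed 16-bit sums to 32 bits, the adds reduce them to a single total, and the final shift left by 1 scales it. In scalar form the whole function collapses to the following (a sketch consistent with the intrinsics; fdct4x4_1_ref is a hypothetical name):

  #include <stdint.h>

  /* Scalar model: sum the 4x4 block and double it to get the DC term. */
  static void fdct4x4_1_ref(const int16_t *input, int16_t *output, int stride) {
    int r, c, sum = 0;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        sum += input[r * stride + c];
    output[0] = (int16_t)(sum << 1);  /* matches the final _mm_slli_epi32(in1, 1) */
  }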
 void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
+  // This 2D transform implements 4 vertical 1D transforms followed
+  // by 4 horizontal 1D transforms. The multiplies and adds are as given
+  // by Chen, Smith and Fralick ('77). The commands for moving the data
+  // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations. The outputs, o0 through oF are labeled according to the
+  // output locations.
+
   // Constants
-  // When we use them, in one case, they are all the same. In all others
-  // it's a pair of them that we need to repeat four times. This is done
-  // by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+  // where cospi_N_64 = cos(N pi /64)
+  const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64,
+                                            cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64,
+                                            cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+                                            cospi_8_64, cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64,
+                                            -cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+                                            cospi_24_64, -cospi_8_64,
+                                            -cospi_24_64, cospi_8_64,
+                                            -cospi_24_64, cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // This second rounding constant saves doing some extra adds at the end
+  const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+                                               +(DCT_CONST_ROUNDING << 1));
+  const int DCT_CONST_BITS2 = DCT_CONST_BITS+2;
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  const __m128i kOne = _mm_set1_epi16(1);
-  __m128i in0, in1, in2, in3;
+  __m128i in0, in1;
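The k__DCT_CONST_ROUNDING2 / DCT_CONST_BITS2 pair above folds the final (u + 1) >> 2 post-condition into the stage 4 descale. With R = 1 << (DCT_CONST_BITS - 1), adding 1 after the shift by DCT_CONST_BITS equals adding 1 << DCT_CONST_BITS before it, and two arithmetic right shifts compose exactly, so (((x + R) >> B) + 1) >> 2 == (x + 3R) >> (B + 2), which is where the tripled rounding constant and the +2 on the shift come from. A quick check of the identity (B = 14 is the DCT_CONST_BITS value in vp9_idct.h):

  #include <assert.h>

  /* Verify the rounding fold behind k__DCT_CONST_ROUNDING2/DCT_CONST_BITS2. */
  static void check_round_fold(int x) {
    enum { B = 14, R = 1 << (B - 1) };
    assert(((((x + R) >> B) + 1) >> 2) == ((x + 3 * R) >> (B + 2)));
  }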
+  // Load inputs.
   {
     in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
     in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-    in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-    in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-    // x = x << 4
+    in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+          (input + 2 * stride)));
+    in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+          (input + 3 * stride)));
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+
+
+    // multiply by 16 to give some extra precision
     in0 = _mm_slli_epi16(in0, 4);
     in1 = _mm_slli_epi16(in1, 4);
-    in2 = _mm_slli_epi16(in2, 4);
-    in3 = _mm_slli_epi16(in3, 4);
     // if (i == 0 && input[0]) input[0] += 1;
+    // add 1 to the upper left pixel if it is non-zero, which helps reduce
+    // the round-trip error
     {
-      // The mask will only contain whether the first value is zero, all
+      // other comparison will fail as something shifted by 4 (above << 4)
+      // can never be equal to one. To increment in the non-zero case, we
+      // add the mask and one for the first element:
@@ -57,60 +130,119 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
       in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
     }
   }
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // Transform 1/2: Add/substract
-    const __m128i r0 = _mm_add_epi16(in0, in3);
-    const __m128i r1 = _mm_add_epi16(in1, in2);
-    const __m128i r2 = _mm_sub_epi16(in1, in2);
-    const __m128i r3 = _mm_sub_epi16(in0, in3);
-    // Transform 1/2: Interleave to do the multiply by constants which gets us
-    // into 32 bits.
-    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+  // There are 4 total stages, alternating between an add/subtract stage
+  // followed by a multiply-and-add stage.
+  {
+    // Stage 1: Add/subtract
+
+    // in0 = [i0 i1 i2 i3 iC iD iE iF]
+    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+    // r1 = [iC i8 iD i9 iE iA iF iB]
+    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+    // r3 = [iC i8 iD i9 iF iB iE iA]
+
+    const __m128i t0 = _mm_add_epi16(r2, r3);
+    const __m128i t1 = _mm_sub_epi16(r2, r3);
+    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+    // t1 = [aC a8 aD a9 aF aB aE aA]
+
+    // Stage 2: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+    // Then add and right-shift to get back to 16-bit range
     const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
     const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
     const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
     const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-    // Combine and transpose
-    const __m128i res0 = _mm_packs_epi32(w0, w2);
-    const __m128i res1 = _mm_packs_epi32(w4, w6);
-    // 00 01 02 03 20 21 22 23
-    // 10 11 12 13 30 31 32 33
-    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
-    // 00 10 01 11 02 12 03 13
-    // 20 30 21 31 22 32 23 33
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-    // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
-    // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
-    if (0 == pass) {
-      // Extract values in the high part for second pass as transform code
-      // only uses the first four values.
-      in1 = _mm_unpackhi_epi64(in0, in0);
-      in3 = _mm_unpackhi_epi64(in2, in2);
-    } else {
-      // Post-condition output and store it (v + 1) >> 2, taking advantage
-      // of the fact 1/3 are stored just after 0/2.
-      __m128i out01 = _mm_add_epi16(in0, kOne);
-      __m128i out23 = _mm_add_epi16(in2, kOne);
-      out01 = _mm_srai_epi16(out01, 2);
-      out23 = _mm_srai_epi16(out23, 2);
-      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
-      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
-    }
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+    // w0 = [b0 b1 b7 b6]
+    // w1 = [b8 b9 bF bE]
+    // w2 = [b4 b5 b3 b2]
+    // w3 = [bC bD bB bA]
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+    // x1 = [b4 b5 b3 b2 bC bD bB bA]
+    in0 = _mm_shuffle_epi32(x0, 0xD8);
+    in1 = _mm_shuffle_epi32(x1, 0x8D);
+    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+    // in1 = [b3 b2 bB bA b4 b5 bC bD]
+  }
+  {
+    // vertical DCTs finished. Now we do the horizontal DCTs.
+    // Stage 3: Add/subtract
+
+    const __m128i t0 = _mm_add_epi16(in0, in1);
+    const __m128i t1 = _mm_sub_epi16(in0, in1);
+    // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+
+    // Stage 4: multiply by constants (which gets us into 32 bits).
+    // The constants needed here are:
+    // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+    // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+    // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+    // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+    const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+    const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+    // Then add and right-shift to get back to 16-bit range
+    // but this combines the final right-shift as well to save operations
+    // This unusual rounding operation is to maintain bit-accurate
+    // compatibility with the c version of this function which has two
+    // rounding steps in a row.
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+    // w0 = [o0 o4 o8 oC]
+    // w1 = [o2 o6 oA oE]
+    // w2 = [o1 o5 o9 oD]
+    // w3 = [o3 o7 oB oF]
+    // remember the o's are numbered according to the correct output location
+    const __m128i x0 = _mm_packs_epi32(w0, w1);
+    const __m128i x1 = _mm_packs_epi32(w2, w3);
+    // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+    // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+    const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+    const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+    // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+    // y1 = [o2 o3 o6 o7 oA oB oE oF]
+    in0 = _mm_unpacklo_epi32(y0, y1);
+    // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+    in1 = _mm_unpackhi_epi32(y0, y1);
+    // in1 = [o8 o9 oA oB oC oD oE oF]
+  }
+  // Post-condition (v + 1) >> 2 is now incorporated into previous
+  // add and right-shift commands. Only 2 store instructions needed
+  // because we are using the fact that 1/3 are stored just after 0/2.
+ { + _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); + _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); } } + static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, int stride) { const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); @@ -163,7 +295,7 @@ static INLINE void transpose_4x4(__m128i *res) { res[3] = _mm_unpackhi_epi64(res[2], res[2]); } -void fdct4_1d_sse2(__m128i *in) { +void fdct4_sse2(__m128i *in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); @@ -196,7 +328,7 @@ void fdct4_1d_sse2(__m128i *in) { transpose_4x4(in); } -void fadst4_1d_sse2(__m128i *in) { +void fadst4_sse2(__m128i *in) { const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); @@ -244,32 +376,76 @@ void fadst4_1d_sse2(__m128i *in) { transpose_4x4(in); } -void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[4]; - load_buffer_4x4(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct4_1d_sse2(in); - fdct4_1d_sse2(in); + case DCT_DCT: + vp9_fdct4x4_sse2(input, output, stride); break; - case 1: // ADST_DCT - fadst4_1d_sse2(in); - fdct4_1d_sse2(in); + case ADST_DCT: + load_buffer_4x4(input, in, stride); + fadst4_sse2(in); + fdct4_sse2(in); + write_buffer_4x4(output, in); break; - case 2: // DCT_ADST - fdct4_1d_sse2(in); - fadst4_1d_sse2(in); + case DCT_ADST: + load_buffer_4x4(input, in, stride); + fdct4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); break; - case 3: // ADST_ADST - fadst4_1d_sse2(in); - fadst4_1d_sse2(in); - break; - default: - assert(0); + case ADST_ADST: + load_buffer_4x4(input, in, stride); + fadst4_sse2(in); + fadst4_sse2(in); + write_buffer_4x4(output, in); break; + default: + assert(0); + break; } - write_buffer_4x4(output, in); +} + +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + _mm_store_si128((__m128i *)(output), in1); } void vp9_fdct8x8_sse2(const int16_t *input, int16_t 
*output, int stride) { @@ -313,7 +489,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); @@ -324,7 +500,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -386,7 +562,7 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -606,59 +782,7 @@ static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - -void fdct8_1d_sse2(__m128i *in) { +void fdct8_sse2(__m128i *in) { // constants const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = 
pair_set_epi16(cospi_16_64, -cospi_16_64); @@ -798,7 +922,7 @@ void fdct8_1d_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void fadst8_1d_sse2(__m128i *in) { +void fadst8_sse2(__m128i *in) { // Constants const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); @@ -1028,40 +1152,114 @@ void fadst8_1d_sse2(__m128i *in) { array_transpose_8x8(in, in); } -void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in[8]; - load_buffer_8x8(input, in, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct8_1d_sse2(in); - fdct8_1d_sse2(in); + case DCT_DCT: + vp9_fdct8x8_sse2(input, output, stride); break; - case 1: // ADST_DCT - fadst8_1d_sse2(in); - fdct8_1d_sse2(in); + case ADST_DCT: + load_buffer_8x8(input, in, stride); + fadst8_sse2(in); + fdct8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; - case 2: // DCT_ADST - fdct8_1d_sse2(in); - fadst8_1d_sse2(in); + case DCT_ADST: + load_buffer_8x8(input, in, stride); + fdct8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; - case 3: // ADST_ADST - fadst8_1d_sse2(in); - fadst8_1d_sse2(in); + case ADST_ADST: + load_buffer_8x8(input, in, stride); + fadst8_sse2(in); + fadst8_sse2(in); + right_shift_8x8(in, 1); + write_buffer_8x8(output, in, 8); break; default: assert(0); break; } - right_shift_8x8(in, 1); - write_buffer_8x8(output, in, 8); +} + +void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + input += 8 * i; + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + 
sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + _mm_store_si128((__m128i *)(output), in1); } void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we tranpose the columns (that + // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). int pass; @@ -1076,7 +1274,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -1216,9 +1414,9 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { step1_6 = _mm_sub_epi16(in01, in14); step1_7 = _mm_sub_epi16(in00, in15); } - // Work on the first eight values; fdct8_1d(input, even_results); + // Work on the first eight values; fdct8(input, even_results); { - // Add/substract + // Add/subtract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); @@ -1229,7 +1427,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { - // Add/substract + // Add/subtract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); @@ -1293,7 +1491,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/substract + // Add/subtract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); @@ -1402,8 +1600,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); @@ -1424,8 +1622,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const 
__m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); @@ -1443,10 +1641,10 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); - step1_2 = _mm_sub_epi16(step3_3, step2_2); - step1_3 = _mm_add_epi16(step3_3, step2_2); - step1_4 = _mm_add_epi16(step3_4, step2_5); - step1_5 = _mm_sub_epi16(step3_4, step2_5); + step1_2 = _mm_add_epi16(step3_3, step2_2); + step1_3 = _mm_sub_epi16(step3_3, step2_2); + step1_4 = _mm_sub_epi16(step3_4, step2_5); + step1_5 = _mm_add_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } @@ -1705,23 +1903,6 @@ static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1730,14 +1911,14 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { right_shift_8x8(res1 + 8, 2); } -void fdct16_1d_8col(__m128i *in) { +void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); @@ -1941,10 +2122,10 @@ void fdct16_1d_8col(__m128i *in) { v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); + v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08); + v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08); + v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24); + v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24); v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); @@ -1974,10 +2155,10 @@ void fdct16_1d_8col(__m128i *in) { // stage 5 s[0] = _mm_add_epi16(p[0], t[1]); s[1] = _mm_sub_epi16(p[0], t[1]); - 
s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); + s[2] = _mm_add_epi16(p[3], t[2]); + s[3] = _mm_sub_epi16(p[3], t[2]); + s[4] = _mm_sub_epi16(p[4], t[5]); + s[5] = _mm_add_epi16(p[4], t[5]); s[6] = _mm_sub_epi16(p[7], t[6]); s[7] = _mm_add_epi16(p[7], t[6]); @@ -2052,7 +2233,7 @@ void fdct16_1d_8col(__m128i *in) { in[15] = _mm_packs_epi32(v[14], v[15]); } -void fadst16_1d_8col(__m128i *in) { +void fadst16_8col(__m128i *in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -2522,58 +2703,132 @@ void fadst16_1d_8col(__m128i *in) { in[15] = _mm_sub_epi16(kZero, s[1]); } -void fdct16_1d_sse2(__m128i *in0, __m128i *in1) { - fdct16_1d_8col(in0); - fdct16_1d_8col(in1); +void fdct16_sse2(__m128i *in0, __m128i *in1) { + fdct16_8col(in0); + fdct16_8col(in1); array_transpose_16x16(in0, in1); } -void fadst16_1d_sse2(__m128i *in0, __m128i *in1) { - fadst16_1d_8col(in0); - fadst16_1d_8col(in1); +void fadst16_sse2(__m128i *in0, __m128i *in1) { + fadst16_8col(in0); + fadst16_8col(in1); array_transpose_16x16(in0, in1); } -void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, - int stride, int tx_type) { +void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, + int stride, int tx_type) { __m128i in0[16], in1[16]; - load_buffer_16x16(input, in0, in1, stride); + switch (tx_type) { - case 0: // DCT_DCT - fdct16_1d_sse2(in0, in1); - right_shift_16x16(in0, in1); - fdct16_1d_sse2(in0, in1); + case DCT_DCT: + vp9_fdct16x16_sse2(input, output, stride); break; - case 1: // ADST_DCT - fadst16_1d_sse2(in0, in1); + case ADST_DCT: + load_buffer_16x16(input, in0, in1, stride); + fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); - fdct16_1d_sse2(in0, in1); + fdct16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; - case 2: // DCT_ADST - fdct16_1d_sse2(in0, in1); + case DCT_ADST: + load_buffer_16x16(input, in0, in1, stride); + fdct16_sse2(in0, in1); right_shift_16x16(in0, in1); - fadst16_1d_sse2(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; - case 3: // ADST_ADST - fadst16_1d_sse2(in0, in1); + case ADST_ADST: + load_buffer_16x16(input, in0, in1, stride); + fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); - fadst16_1d_sse2(in0, in1); + fadst16_sse2(in0, in1); + write_buffer_16x16(output, in0, in1, 16); break; default: assert(0); break; } - write_buffer_16x16(output, in0, in1, 16); +} + +void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = 
_mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + _mm_store_si128((__m128i *)(output), in1); } #define FDCT32x32_2D vp9_fdct32x32_rd_sse2 #define FDCT32x32_HIGH_PRECISION 0 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" -#undef FDCT32x32_2D #undef FDCT32x32_HIGH_PRECISION +#undef FDCT32x32_2D #define FDCT32x32_2D vp9_fdct32x32_sse2 #define FDCT32x32_HIGH_PRECISION 1 #include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT -#undef FDCT32x32_2D #undef FDCT32x32_HIGH_PRECISION +#undef FDCT32x32_2D diff --git a/media/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/media/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm new file mode 100644 index 000000000000..28458dcdd522 --- /dev/null +++ b/media/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm @@ -0,0 +1,182 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the forward transformation. Part +; of the macro definitions are originally derived from the ffmpeg project. +; The current version applies to x86 64-bit only. 
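(For readers of the asm that follows: MUL_ADD_2X and BUTTERFLY_4X implement vp9's dct_const_round_shift rotation, i.e. pmaddwd against a 14-bit cosine pair, paddd with pd_8192, then psrad by 14. A minimal scalar C sketch of that step, illustrative only and not part of the patch:)

    #include <stdint.h>

    /* Scalar model of the rounding in MUL_ADD_2X: add 8192 (pd_8192), then
     * arithmetic shift right by 14, the precision of the cospi constants. */
    static int16_t dct_const_round_shift(int32_t in) {
      return (int16_t)((in + 8192) >> 14);
    }

    /* One butterfly as encoded by a TRANSFORM_COEFFS pair (c1, c2):
     * out0 = round(a*c1 + b*c2), out1 = round(b*c1 - a*c2). For example,
     * BUTTERFLY_4X with 6270, 15137 rotates by (cospi_24_64, cospi_8_64). */
    static void butterfly(int16_t a, int16_t b, int c1, int c2,
                          int16_t *out0, int16_t *out1) {
      *out0 = dct_const_round_shift(a * c1 + b * c2);
      *out1 = dct_const_round_shift(b * c1 - a * c2);
    }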
+ +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +; 1D forward 8x8 DCT transform +%macro FDCT8_1D 1 + SUM_SUB 0, 7, 9 + SUM_SUB 1, 6, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 3, 4, 9 + + SUM_SUB 0, 3, 9 + SUM_SUB 1, 2, 9 + SUM_SUB 6, 5, 9 +%if %1 == 0 + SUM_SUB 0, 1, 9 +%endif + + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 + + pmulhrsw m6, m12 + pmulhrsw m5, m12 +%if %1 == 0 + pmulhrsw m0, m12 + pmulhrsw m1, m12 +%else + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 + SWAP 0, 1 +%endif + + SUM_SUB 4, 5, 9 + SUM_SUB 7, 6, 9 + BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 + SWAP 1, 4 + SWAP 3, 6 +%endmacro + +%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 + psraw m%3, m%1, 15 + psraw m%4, m%2, 15 + psubw m%1, m%3 + psubw m%2, m%4 + psraw m%1, 1 + psraw m%2, 1 +%endmacro + +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [pd_8192] + mova m12, [pw_11585x2] + pxor m11, m11 + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + FDCT8_1D 0 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + FDCT8_1D 1 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + DIVIDE_ROUND_2X 0, 1, 9, 10 + DIVIDE_ROUND_2X 2, 3, 9, 10 + DIVIDE_ROUND_2X 4, 5, 9, 10 + DIVIDE_ROUND_2X 6, 7, 9, 10 + + mova [outputq + 0], m0 + mova [outputq + 16], m1 + mova [outputq + 32], m2 + mova [outputq + 48], m3 + mova [outputq + 64], m4 + mova [outputq + 80], m5 + mova [outputq + 96], m6 + mova [outputq + 112], m7 + + RET +%endif diff --git a/media/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c new file mode 100644 index 000000000000..c67490fad345 --- /dev/null 
+++ b/media/libvpx/vp9/encoder/x86/vp9_error_intrin_avx2.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> // AVX2 +#include "vpx/vpx_integer.h" + + +int64_t vp9_block_error_avx2(const int16_t *coeff, + const int16_t *dqcoeff, + intptr_t block_size, + int64_t *ssz) { + __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_reg_64hi, ssz_reg_64hi; + __m128i sse_reg128, ssz_reg128; + int64_t sse; + int i; + const __m256i zero_reg = _mm256_set1_epi16(0); + + // init sse and ssz registers to zero + sse_reg = _mm256_set1_epi16(0); + ssz_reg = _mm256_set1_epi16(0); + + for (i = 0 ; i < block_size ; i+= 16) { + // load 32 bytes from coeff and dqcoeff + coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i)); + dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i)); + // dqcoeff - coeff + dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); + // madd (dqcoeff - coeff) + dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); + // madd coeff + coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); + // expand each double word of madd (dqcoeff - coeff) to quad word + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); + // add each quad word of madd (dqcoeff - coeff) and madd (coeff) + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); + } + // save the higher 64 bit of each 128 bit lane + sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); + ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); + // add the higher 64 bit to the low 64 bit + sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); + ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); + + // add each 64 bit from each of the 128 bit lane of the 256 bit + sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), + _mm256_extractf128_si256(sse_reg, 1)); + + ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), + _mm256_extractf128_si256(ssz_reg, 1)); + + // store the results + _mm_storel_epi64((__m128i*)(&sse), sse_reg128); + + _mm_storel_epi64((__m128i*)(ssz), ssz_reg128); + return sse; } diff --git a/media/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h b/media/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h deleted file mode 100644 index ca80b8bff6b7..000000000000 --- a/media/libvpx/vp9/encoder/x86/vp9_mcomp_x86.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree.
- */ - - -#ifndef VP9_ENCODER_X86_VP9_MCOMP_X86_H_ -#define VP9_ENCODER_X86_VP9_MCOMP_X86_H_ - -#if HAVE_SSE3 -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_search_full_search -#define vp9_search_full_search vp9_full_search_sadx3 - -#undef vp9_search_refining_search -#define vp9_search_refining_search vp9_refining_search_sadx4 - -#undef vp9_search_diamond_search -#define vp9_search_diamond_search vp9_diamond_search_sadx4 - -#endif -#endif - -#if HAVE_SSE4_1 -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_search_full_search -#define vp9_search_full_search vp9_full_search_sadx8 - -#endif -#endif - -#endif - diff --git a/media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm b/media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm similarity index 54% rename from media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm rename to media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index db306603b5fc..508e1d4f55af 100644 --- a/media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -188,7 +188,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmaxsw m8, m7 pshuflw m7, m8, 0x1 pmaxsw m8, m7 - pextrw [r2], m8, 0 + pextrw r6, m8, 0 + mov [r2], r6 RET ; skip-block, i.e. just write all zeroes @@ -214,5 +215,188 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endmacro INIT_XMM ssse3 -QUANTIZE_FN b, 6 +QUANTIZE_FN b, 7 QUANTIZE_FN b_32x32, 7 + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob + lea coeffq, [ coeffq+ncoeffq*2] + lea iscanq, [ iscanq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 
+ psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, fp_32x32 + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6, m7 + pmovmskb r2, m12 + + or r6, r2 + jz .skip_iter +%endif + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, fp_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [dqcoeffq+ncoeffq*2+ 0], m7 + mova [dqcoeffq+ncoeffq*2+16], m7 + mova [qcoeffq+ncoeffq*2+ 0], m7 + mova [qcoeffq+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [eobq], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/media/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c new file mode 100644 index 000000000000..1feed62566bd --- /dev/null +++ b/media/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <immintrin.h> // AVX2 +#include "vpx/vpx_integer.h" + +void vp9_sad32x32x4d_avx2(uint8_t *src, + int src_stride, + uint8_t *ref[4], + int ref_stride, + unsigned int res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32 ; i++) { + // load src and all refs + src_reg = _mm256_loadu_si256((__m256i *)(src)); + ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); + ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); + ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); + ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} + +void vp9_sad64x64x4d_avx2(uint8_t *src, + int src_stride, + uint8_t *ref[4], + int ref_stride, + unsigned int res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64 ; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_loadu_si256((__m256i *)(src)); + srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); + ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); + ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); + ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); + ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); + ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); + ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); +
ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src+= src_stride; + ref0+= ref_stride; + ref1+= ref_stride; + ref2+= ref_stride; + ref3+= ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } +} diff --git a/media/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm b/media/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm deleted file mode 100644 index 32fdd23d7d0d..000000000000 --- a/media/libvpx/vp9/encoder/x86/vp9_sad_mmx.asm +++ /dev/null @@ -1,427 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -global sym(vp9_sad16x16_mmx) PRIVATE -global sym(vp9_sad8x16_mmx) PRIVATE -global sym(vp9_sad8x8_mmx) PRIVATE -global sym(vp9_sad4x4_mmx) PRIVATE -global sym(vp9_sad16x8_mmx) PRIVATE - -;unsigned int vp9_sad16x16_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad16x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x16x16sad_mmx_loop: - - movq mm0, QWORD PTR [rsi] - movq mm2, QWORD PTR [rsi+8] - - movq mm1, QWORD PTR [rdi] - movq mm3, QWORD PTR [rdi+8] - - movq mm4, mm0 - movq mm5, mm2 - - psubusb mm0, mm1 - psubusb mm1, mm4 - - psubusb mm2, mm3 - psubusb mm3, mm5 - - por mm0, mm1 - por mm2, mm3 - - movq mm1, mm0 - movq mm3, mm2 - - punpcklbw mm0, mm6 - punpcklbw mm2, mm6 - - punpckhbw mm1, mm6 - punpckhbw mm3, mm6 - - paddw mm0, mm2 - paddw mm1, mm3 - - - lea rsi, [rsi+rax] - add rdi, rdx - - paddw mm7, mm0 - paddw mm7, mm1 - - cmp rsi, rcx - jne .x16x16sad_mmx_loop - - - movq mm0, mm7 - - punpcklwd mm0, mm6 - punpckhwd mm7, mm6 - - paddw mm0, mm7 - movq mm7, mm0 - - - psrlq mm0, 32 - paddw mm7, mm0 - - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad8x16_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad8x16_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - - lea rcx, [rcx+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x8x16sad_mmx_loop: - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, mm0 - psubusb mm0, mm1 - - psubusb mm1, mm2 - por mm0, mm1 - - movq mm2, mm0 - punpcklbw mm0, mm6 - - punpckhbw mm2, mm6 - lea rsi, [rsi+rax] - - add rdi, rdx - paddw mm7, mm0 - - paddw mm7, mm2 - cmp rsi, rcx - - jne .x8x16sad_mmx_loop - - movq mm0, mm7 - punpcklwd mm0, mm6 - - punpckhwd mm7, mm6 - paddw mm0, mm7 - - movq mm7, mm0 - psrlq mm0, 32 - - paddw mm7, mm0 - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad8x8_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x8x8sad_mmx_loop: - - movq mm0, QWORD PTR [rsi] - movq mm1, QWORD PTR [rdi] - - movq mm2, mm0 - psubusb mm0, mm1 - - psubusb mm1, mm2 - por mm0, mm1 - - movq mm2, mm0 - punpcklbw mm0, mm6 - - punpckhbw mm2, mm6 - paddw mm0, mm2 - - lea rsi, [rsi+rax] - add rdi, rdx - - paddw mm7, mm0 - cmp rsi, rcx - - jne .x8x8sad_mmx_loop - - movq mm0, mm7 - punpcklwd mm0, mm6 - - punpckhwd mm7, mm6 - paddw mm0, mm7 - - movq mm7, mm0 - psrlq mm0, 32 - - paddw mm7, mm0 - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - 
-;unsigned int vp9_sad4x4_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - movd mm0, DWORD PTR [rsi] - movd mm1, DWORD PTR [rdi] - - movd mm2, DWORD PTR [rsi+rax] - movd mm3, DWORD PTR [rdi+rdx] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movq mm2, mm0 - psubusb mm0, mm1 - - psubusb mm1, mm2 - por mm0, mm1 - - movq mm2, mm0 - pxor mm3, mm3 - - punpcklbw mm0, mm3 - punpckhbw mm2, mm3 - - paddw mm0, mm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movd mm4, DWORD PTR [rsi] - movd mm5, DWORD PTR [rdi] - - movd mm6, DWORD PTR [rsi+rax] - movd mm7, DWORD PTR [rdi+rdx] - - punpcklbw mm4, mm6 - punpcklbw mm5, mm7 - - movq mm6, mm4 - psubusb mm4, mm5 - - psubusb mm5, mm6 - por mm4, mm5 - - movq mm5, mm4 - punpcklbw mm4, mm3 - - punpckhbw mm5, mm3 - paddw mm4, mm5 - - paddw mm0, mm4 - movq mm1, mm0 - - punpcklwd mm0, mm3 - punpckhwd mm1, mm3 - - paddw mm0, mm1 - movq mm1, mm0 - - psrlq mm0, 32 - paddw mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_sad16x8_mmx( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -sym(vp9_sad16x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - lea rcx, [rsi+rax*8] - pxor mm7, mm7 - - pxor mm6, mm6 - -.x16x8sad_mmx_loop: - - movq mm0, [rsi] - movq mm1, [rdi] - - movq mm2, [rsi+8] - movq mm3, [rdi+8] - - movq mm4, mm0 - movq mm5, mm2 - - psubusb mm0, mm1 - psubusb mm1, mm4 - - psubusb mm2, mm3 - psubusb mm3, mm5 - - por mm0, mm1 - por mm2, mm3 - - movq mm1, mm0 - movq mm3, mm2 - - punpcklbw mm0, mm6 - punpckhbw mm1, mm6 - - punpcklbw mm2, mm6 - punpckhbw mm3, mm6 - - - paddw mm0, mm2 - paddw mm1, mm3 - - paddw mm0, mm1 - lea rsi, [rsi+rax] - - add rdi, rdx - paddw mm7, mm0 - - cmp rsi, rcx - jne .x16x8sad_mmx_loop - - movq mm0, mm7 - punpcklwd mm0, mm6 - - punpckhwd mm7, mm6 - paddw mm0, mm7 - - movq mm7, mm0 - psrlq mm0, 32 - - paddw mm7, mm0 - movq rax, mm7 - - pop rdi - pop rsi - mov rsp, rbp - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/media/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm b/media/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm new file mode 100644 index 000000000000..455d10d2c8a4 --- /dev/null +++ b/media/libvpx/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm @@ -0,0 +1,216 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(vp9_ssim_parms_16x16_sse2) PRIVATE +sym(vp9_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
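(As a reference for the TABULATE_SSIM macro above: its five accumulators reduce to the following scalar sums over a block. This helper is an illustrative sketch under the same argument layout, not code from the patch:)

    #include <stdint.h>

    /* Scalar form of the sums vp9_ssim_parms_{16x16,8x8}_sse2 tabulate over
     * an n x n block (n = 16 or 8): sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr. */
    static void ssim_parms_ref(const uint8_t *s, int sp,
                               const uint8_t *r, int rp, int n,
                               uint32_t *sum_s, uint32_t *sum_r,
                               uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                               uint32_t *sum_sxr) {
      int i, j;
      for (i = 0; i < n; ++i, s += sp, r += rp) {
        for (j = 0; j < n; ++j) {
          *sum_s += s[j];
          *sum_r += r[j];
          *sum_sq_s += s[j] * s[j];
          *sum_sq_r += r[j] * r[j];
          *sum_sxr += s[j] * r[j];
        }
      }
    }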
+global sym(vp9_ssim_parms_8x8_sse2) PRIVATE +sym(vp9_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c new file mode 100644 index 000000000000..a441cadaf703 --- /dev/null +++ b/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -0,0 +1,539 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 +#include "vpx_ports/mem.h" +#include "vp9/encoder/vp9_variance.h" + +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, + 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, + 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, + 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, + 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, + 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, + 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, + 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, + 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15 +}; + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *) \ + (src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *) \ + (src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* calculate sum */ \ + sum_reg =
_mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + + +unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + int height, + unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height ; i++) { + src_avg = src_reg; + src+= src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + 
src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height ; i++) { + // save current source average + src_avg = src_reg; + src+= src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + } + } + CALC_SUM_AND_SSE + return sum; +} + +unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + const uint8_t *sec, + int sec_stride, + int height, + unsigned int *sse) { + __m256i sec_reg; + 
+
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sse) {
+  __m256i sec_reg;
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height; i++) {
+        LOAD_SRC_DST
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec += sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += src_stride;
+        dst += dst_stride;
+      }
+    // x_offset = 0 and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg;
+      for (i = 0; i < height; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec += sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += src_stride;
+        dst += dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                                 (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec += sec_stride;
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += src_stride;
+        dst += dst_stride;
+      }
+    }
+  // x_offset = 8 and y_offset = 0
+  } else if (x_offset == 8) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec += sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += src_stride;
+        dst += dst_stride;
+      }
+    // x_offset = 8 and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load the current row and the row starting one byte later
+      src_reg = _mm256_loadu_si256((__m256i const *)(src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height; i++) {
+        // save the current horizontal average
+        src_avg = src_reg;
+        src += src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average the previous and current horizontal averages
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        sec += sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst += dst_stride;
+      }
+    // x_offset = 8 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                                 (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load the current row and the row starting one byte later
+      src_reg = _mm256_loadu_si256((__m256i const *)(src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height; i++) {
+        // save the current horizontal average
+        src_avg = src_reg;
+        src += src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        sec += sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst += dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                                 (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        sec += sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += src_stride;
+        dst += dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                                 (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *)(src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // pack each 16-bit value back to 8 bits, per low and high lane
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height; i++) {
+        src += src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previous and current packed rows
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        sec += sec_stride;
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst += dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                                  (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                                  (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load the current row and the row starting one byte later
+      src_reg = _mm256_loadu_si256((__m256i const *)(src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // pack each 16-bit value back to 8 bits, per low and high lane
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height; i++) {
+        src += src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge the previous and current packed rows
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the merged rows vertically
+        FILTER_SRC(yfilter)
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        sec += sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst += dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;
+}
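The _avg_ variant differs only in folding in a second predictor: after each
row is filtered, it is averaged with 32 bytes from sec (the compound
prediction buffer) before the difference against dst is accumulated. What
both kernels accumulate per 32-wide row is simply the sum and the sum of
squares of the prediction error; a scalar reference (hypothetical helper,
for checking only):

/* ref_sum_sse models what the 32xh kernels accumulate, assuming pred[]
   already holds the (optionally sec-averaged) filtered source rows. */
static void ref_sum_sse(const unsigned char *pred, int pred_stride,
                        const unsigned char *dst, int dst_stride,
                        int h, int *sum, unsigned int *sse) {
  int i, j;
  *sum = 0;
  *sse = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < 32; ++j) {
      const int d = pred[i * pred_stride + j] - dst[i * dst_stride + j];
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
  }
}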
diff --git a/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
deleted file mode 100644
index 2ecc23e55943..000000000000
--- a/media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
+++ /dev/null @@ -1,337 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_half_horiz_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(vp9_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -.half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz .half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE -sym(vp9_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr - - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, 
dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -.half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz .half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_horiz_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE -sym(vp9_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - -.half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz .half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop 
rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/media/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/media/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm index d2d13b383942..21aaa9383181 100644 --- a/media/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ b/media/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm @@ -15,41 +15,45 @@ ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 ; unsigned char *frame2, | 2 -; unsigned int block_size, | 3 -; int strength, | 4 -; int filter_weight, | 5 -; unsigned int *accumulator, | 6 -; unsigned short *count) | 7 +; unsigned int block_width, | 3 +; unsigned int block_height, | 4 +; int strength, | 5 +; int filter_weight, | 6 +; unsigned int *accumulator, | 7 +; unsigned short *count) | 8 global sym(vp9_temporal_filter_apply_sse2) PRIVATE sym(vp9_temporal_filter_apply_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 + SHADOW_ARGS_TO_STACK 9 SAVE_XMM 7 GET_GOT rbx push rsi push rdi ALIGN_STACK 16, rax - %define block_size 0 - %define strength 16 - %define filter_weight 32 - %define rounding_bit 48 - %define rbp_backup 64 - %define stack_size 80 + %define block_width 0 + %define block_height 16 + %define strength 32 + %define filter_weight 48 + %define rounding_bit 64 + %define rbp_backup 80 + %define stack_size 96 sub rsp, stack_size mov [rsp + rbp_backup], rbp ; end prolog - mov rdx, arg(3) - mov [rsp + block_size], rdx - movd xmm6, arg(4) + mov edx, arg(3) + mov [rsp + block_width], rdx + mov edx, arg(4) + mov [rsp + block_height], rdx + movd xmm6, arg(5) movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read ; calculate the rounding bit outside the loop ; 0x8000 >> (16 - strength) mov rdx, 16 - sub rdx, arg(4) ; 16 - strength + sub rdx, arg(5) ; 16 - strength movq xmm4, rdx ; can't use rdx w/ shift movdqa xmm5, [GLOBAL(_const_top_bit)] psrlw xmm5, xmm4 @@ -57,11 +61,11 @@ sym(vp9_temporal_filter_apply_sse2): mov rsi, arg(0) ; src/frame1 mov rdx, arg(2) ; predictor frame - mov rdi, arg(6) ; accumulator - mov rax, arg(7) ; count + mov rdi, arg(7) ; accumulator + mov rax, arg(8) ; count ; dup the filter weight and store for later - movd xmm0, arg(5) ; filter_weight + movd xmm0, arg(6) ; filter_weight pshuflw xmm0, xmm0, 0 punpcklwd xmm0, xmm0 movdqa [rsp + filter_weight], xmm0 @@ -69,10 +73,11 @@ sym(vp9_temporal_filter_apply_sse2): mov rbp, arg(1) ; stride pxor xmm7, xmm7 ; zero for extraction - lea rcx, [rdx + 16*16*1] - cmp dword ptr [rsp + block_size], 8 + mov rcx, [rsp + block_width] + imul rcx, [rsp + block_height] + add rcx, rdx + cmp dword ptr [rsp + block_width], 8 jne .temporal_filter_apply_load_16 - lea rcx, [rdx + 8*8*1] .temporal_filter_apply_load_8: movq xmm0, [rsi] ; first row @@ -178,7 +183,7 @@ sym(vp9_temporal_filter_apply_sse2): cmp rdx, rcx je .temporal_filter_apply_epilog pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_size], 16 + cmp dword ptr [rsp + block_width], 16 je .temporal_filter_apply_load_16 jmp .temporal_filter_apply_load_8 diff --git a/media/libvpx/vp9/encoder/x86/vp9_variance_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_variance_avx2.c new file mode 100644 index 000000000000..ea09b959e125 --- /dev/null +++ b/media/libvpx/vp9/encoder/x86/vp9_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vp9_get16x16var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void vp9_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, + unsigned int *sse); + +unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + const uint8_t *sec, + int sec_stride, + int height, + unsigned int *sseptr); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + get_var_avx2 var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, + &ref[ref_stride * i + j], ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + + +unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_get16x16var_avx2, 16); + return *sse - (((unsigned int)sum * sum) >> 8); +} + +unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse; +} + +unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 9); +} + +unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 10); +} + +unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 12); +} + +unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vp9_get32x32var_avx2, 32); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + unsigned int sse1; + const int se1 = 
vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                  y_offset, dst, dst_stride,
+                                  64, &sse1);
+  unsigned int sse2;
+  const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
+}
+
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
+                                                      x_offset, y_offset,
+                                                      dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
+  unsigned int sse2;
+  const int se2 =
+      vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                          y_offset, dst + 32, dst_stride,
+                                          sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
+
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  // process 32 elements in parallel
+  const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
+                                                     x_offset, y_offset,
+                                                     dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
+}
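Each wrapper above closes with the same one-pass variance identity: for
N = w * h pixels, variance = SSE - sum * sum / N, and since N is a power of
two the division is the final shift (>> 8 for 16x16, >> 10 for 32x32, >> 12
for 64x64, and so on). The int64_t cast guards the sum * sum product for the
larger blocks; for 16x16 the product still fits an unsigned 32-bit value,
which is why that wrapper can use the narrower cast. An equivalent scalar
form (hypothetical helper name):

#include <stdint.h>

/* variance_from_sums mirrors the closing expression of each wrapper;
   log2_count = log2(w * h), e.g. 8 for 16x16 and 12 for 64x64. */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int log2_count) {
  return sse - (unsigned int)(((int64_t)sum * (int64_t)sum) >> log2_count);
}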
diff --git a/media/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/media/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
new file mode 100644
index 000000000000..f9923280a342
--- /dev/null
+++ b/media/libvpx/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+
+void vp9_get16x16var_avx2(const unsigned char *src_ptr,
+                          int source_stride,
+                          const unsigned char *ref_ptr,
+                          int recon_stride,
+                          unsigned int *SSE,
+                          int *Sum) {
+  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+  __m256i ref_expand_high, madd_low, madd_high;
+  unsigned int i, src_2strides, ref_2strides;
+  __m256i zero_reg = _mm256_set1_epi16(0);
+  __m256i sum_ref_src = _mm256_set1_epi16(0);
+  __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+  // processing two rows per 256-bit register halves the number of
+  // loop iterations (compared to the sse2 code)
+  src_2strides = source_stride << 1;
+  ref_2strides = recon_stride << 1;
+  for (i = 0; i < 8; i++) {
+    src = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i const *)(src_ptr)));
+    src = _mm256_inserti128_si256(src,
+        _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);
+
+    ref = _mm256_castsi128_si256(
+        _mm_loadu_si128((__m128i const *)(ref_ptr)));
+    ref = _mm256_inserti128_si256(ref,
+        _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);
+
+    // expand each byte to 16 bits, per lane
+    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+    // src - ref
+    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+    // madd low (src - ref)
+    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+    // add high to low
+    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+    // madd high (src - ref)
+    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+    // add high to low
+    madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                                    _mm256_add_epi32(madd_low, madd_high));
+
+    src_ptr += src_2strides;
+    ref_ptr += ref_2strides;
+  }
+
+  {
+    __m128i sum_res, madd_res;
+    __m128i expand_sum_low, expand_sum_high, expand_sum;
+    __m128i expand_madd_low, expand_madd_high, expand_madd;
+    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+    // extract the low lane and add it to the high lane
+    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
+                            _mm256_extractf128_si256(sum_ref_src, 1));
+
+    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
+                             _mm256_extractf128_si256(madd_ref_src, 1));
+
+    // pad each 16-bit value with another two zeroed bytes
+    expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
+                                        sum_res);
+    expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
+                                         sum_res);
+
+    // sign-extend by shifting each 32-bit value right by 16
+    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
+    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);
+
+    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);
+
+    // expand each 32 bits of the madd result to 64 bits
+    expand_madd_low = _mm_unpacklo_epi32(madd_res,
+                                         _mm256_castsi256_si128(zero_reg));
+    expand_madd_high = _mm_unpackhi_epi32(madd_res,
+                                          _mm256_castsi256_si128(zero_reg));
+
+    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);
+
+    ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
+                                           _mm256_castsi256_si128(zero_reg));
+    ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
+                                            _mm256_castsi256_si128(zero_reg));
+
+    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+    // shift right by 8 bytes
+    madd_res = _mm_srli_si128(expand_madd, 8);
+    sum_res = _mm_srli_si128(ex_expand_sum, 8);
+
+    madd_res = _mm_add_epi32(madd_res, expand_madd);
+    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);
+
+    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);
+
+    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
+  }
+}
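vp9_get16x16var_avx2 pairs two 16-byte rows per iteration, parking row i in
the low 128-bit lane and row i + stride in the high lane, which is why the
loop runs eight times for a 16x16 block and steps by two strides. The
cast/insert load pair can be read as this small helper (illustrative only,
not part of the patch):

#include <immintrin.h>

/* load_two_rows mirrors the castsi128/inserti128 pair above: one
   256-bit register ends up carrying two consecutive 16-byte rows. */
static __m256i load_two_rows(const unsigned char *p, int stride) {
  const __m256i lo = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)p));
  return _mm256_inserti128_si256(
      lo, _mm_loadu_si128((const __m128i *)(p + stride)), 1);
}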
+
+void vp9_get32x32var_avx2(const unsigned char *src_ptr,
+                          int source_stride,
+                          const unsigned char *ref_ptr,
+                          int recon_stride,
+                          unsigned int *SSE,
+                          int *Sum) {
+  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
+  __m256i ref_expand_high, madd_low, madd_high;
+  unsigned int i;
+  __m256i zero_reg = _mm256_set1_epi16(0);
+  __m256i sum_ref_src = _mm256_set1_epi16(0);
+  __m256i madd_ref_src = _mm256_set1_epi16(0);
+
+  // processing 32 elements in parallel
+  for (i = 0; i < 16; i++) {
+    src = _mm256_loadu_si256((__m256i const *)(src_ptr));
+
+    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));
+
+    // expand each byte to 16 bits, per lane
+    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
+    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);
+
+    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
+    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);
+
+    // src - ref
+    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
+    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);
+
+    // madd low (src - ref)
+    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);
+
+    // add high to low
+    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);
+
+    // madd high (src - ref)
+    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);
+
+    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);
+
+    // add high to low
+    madd_ref_src = _mm256_add_epi32(madd_ref_src,
+                                    _mm256_add_epi32(madd_low, madd_high));
+
+    src_ptr += source_stride;
+    ref_ptr += recon_stride;
+  }
+
+  {
+    __m256i expand_sum_low, expand_sum_high, expand_sum;
+    __m256i expand_madd_low, expand_madd_high, expand_madd;
+    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;
+
+    // pad each 16-bit value with another two zeroed bytes
+    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
+    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);
+
+    // sign-extend by shifting each 32-bit value right by 16
+    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
+    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);
+
+    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);
+
+    // expand each 32 bits of the madd result to 64 bits
+    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
+    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);
+
+    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);
+
+    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
+    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);
+
+    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);
+
+    // shift right by 8 bytes
+    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
+    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);
+
+    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
+    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);
+
+    // extract the low lane and the high lane and add the results
+    *((int *)SSE) = _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));
+
+    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
+  }
+}
diff --git
a/media/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/media/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm deleted file mode 100644 index 3501cf1fdebe..000000000000 --- a/media/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ /dev/null @@ -1,510 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_get_mb_ss_mmx( short *src_ptr ) -global sym(vp9_get_mb_ss_mmx) PRIVATE -sym(vp9_get_mb_ss_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 8 - ; end prolog - - mov rax, arg(0) ;src_ptr - mov rcx, 16 - pxor mm4, mm4 - -.NEXTROW: - movq mm0, [rax] - movq mm1, [rax+8] - movq mm2, [rax+16] - movq mm3, [rax+24] - pmaddwd mm0, mm0 - pmaddwd mm1, mm1 - pmaddwd mm2, mm2 - pmaddwd mm3, mm3 - - paddd mm4, mm0 - paddd mm4, mm1 - paddd mm4, mm2 - paddd mm4, mm3 - - add rax, 32 - dec rcx - ja .NEXTROW - movq QWORD PTR [rsp], mm4 - - ;return sum[0]+sum[1]; - movsxd rax, dword ptr [rsp] - movsxd rcx, dword ptr [rsp+4] - add rax, rcx - - - ; begin epilog - add rsp, 8 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_get8x8var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp9_get8x8var_mmx) PRIVATE -sym(vp9_get8x8var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm1, [rbx] ; Copy eight bytes to mm1 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - - ; Row 2 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq 
mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 3 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 4 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 5 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - ; movq mm4, [rbx + rdx] - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 6 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 7 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - 
pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movq mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Row 8 - movq mm0, [rax] ; Copy eight bytes to mm0 - movq mm2, mm0 ; Take copies - movq mm3, mm1 ; Take copies - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - punpckhbw mm2, mm6 ; unpack to higher prrcision - punpckhbw mm3, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - psubsw mm2, mm3 ; A-B (high order) to MM2 - - paddw mm5, mm0 ; accumulate differences in mm5 - paddw mm5, mm2 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - pmaddwd mm2, mm2 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - paddd mm7, mm0 ; accumulate in mm7 - paddd mm7, mm2 ; accumulate in mm7 - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp9_get4x4var_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride, -; unsigned int *SSE, -; int *Sum -;) -global sym(vp9_get4x4var_mmx) PRIVATE -sym(vp9_get4x4var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - push rbx - sub rsp, 16 - ; end prolog - - - pxor mm5, mm5 ; Blank mmx6 - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - - ; Row 1 - movd mm0, [rax] ; Copy 4 bytes to mm0 - movd mm1, [rbx] ; Copy 4 bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - - ; Row 2 - movd mm0, [rax] ; Copy 4 bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy 4 bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; 
Copy 4 bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy 4 bytes to mm0 - - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - - paddw mm5, mm0 ; accumulate differences in mm5 - - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - - ; Now accumulate the final results. - movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory - movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory - movsx rdx, WORD PTR [rsp+8] - movsx rcx, WORD PTR [rsp+10] - movsx rbx, WORD PTR [rsp+12] - movsx rax, WORD PTR [rsp+14] - add rdx, rcx - add rbx, rax - add rdx, rbx ;XSum - movsxd rax, DWORD PTR [rsp] - movsxd rcx, DWORD PTR [rsp+4] - add rax, rcx ;XXSum - mov rsi, arg(4) ;SSE - mov rdi, arg(5) ;Sum - mov dword ptr [rsi], eax - mov dword ptr [rdi], edx - xor rax, rax ; return 0 - - - ; begin epilog - add rsp, 16 - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int -;vp9_get4x4sse_cs_mmx -;( -; unsigned char *src_ptr, -; int source_stride, -; unsigned char *ref_ptr, -; int recon_stride -;) -global sym(vp9_get4x4sse_cs_mmx) PRIVATE -sym(vp9_get4x4sse_cs_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - push rbx - ; end prolog - - - pxor mm6, mm6 ; Blank mmx7 - pxor mm7, mm7 ; Blank mmx7 - - mov rax, arg(0) ;[src_ptr] ; Load base addresses - mov rbx, arg(2) ;[ref_ptr] - movsxd rcx, dword ptr arg(1) ;[source_stride] - movsxd rdx, dword ptr arg(3) ;[recon_stride] - ; Row 1 - movd mm0, [rax] ; Copy eight bytes to mm0 - movd mm1, [rbx] ; Copy eight bytes to mm1 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 2 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 3 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm1, mm6 - punpcklbw mm0, mm6 ; unpack to higher prrcision - psubsw mm0, mm1 ; A-B (low order) to MM0 - - pmaddwd mm0, mm0 ; square and accumulate - add rbx,rdx ; Inc pointer into ref data - add rax,rcx ; Inc pointer into the new data - movd mm1, [rbx] ; Copy eight bytes to mm1 - paddd mm7, mm0 ; accumulate in mm7 - - ; Row 4 - movd mm0, [rax] ; Copy eight bytes to mm0 - punpcklbw mm0, mm6 ; unpack to higher prrcision - punpcklbw mm1, mm6 - psubsw mm0, mm1 ; A-B (low order) to MM0 - pmaddwd mm0, mm0 ; square and accumulate - paddd mm7, mm0 ; accumulate in mm7 - - movq mm0, mm7 ; - psrlq mm7, 32 - - paddd mm0, mm7 - movq rax, mm0 - - - ; begin epilog - pop rbx - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm deleted file mode 100644 index 2c50881340d4..000000000000 --- a/media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ /dev/null @@ -1,734 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;unsigned int vp9_get_mb_ss_sse2 -;( -; short *src_ptr -;) -global sym(vp9_get_mb_ss_sse2) PRIVATE -sym(vp9_get_mb_ss_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 1 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - mov rax, arg(0) ;[src_ptr] - mov rcx, 8 - pxor xmm4, xmm4 - -.NEXTROW: - movdqa xmm0, [rax] - movdqa xmm1, [rax+16] - movdqa xmm2, [rax+32] - movdqa xmm3, [rax+48] - pmaddwd xmm0, xmm0 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - - paddd xmm0, xmm1 - paddd xmm2, xmm3 - paddd xmm4, xmm0 - paddd xmm4, xmm2 - - add rax, 0x40 - dec rcx - ja .NEXTROW - - movdqa xmm3,xmm4 - psrldq xmm4,8 - paddd xmm4,xmm3 - movdqa xmm3,xmm4 - psrldq xmm4,4 - paddd xmm4,xmm3 - movq rax,xmm4 - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp9_get16x16var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp9_get16x16var_sse2) PRIVATE -sym(vp9_get16x16var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - ; Prefetch data - lea rcx, [rax+rax*2] - prefetcht0 [rsi] - prefetcht0 [rsi+rax] - prefetcht0 [rsi+rax*2] - prefetcht0 [rsi+rcx] - lea rbx, [rsi+rax*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rax] - prefetcht0 [rbx+rax*2] - prefetcht0 [rbx+rcx] - - lea rcx, [rdx+rdx*2] - prefetcht0 [rdi] - prefetcht0 [rdi+rdx] - prefetcht0 [rdi+rdx*2] - prefetcht0 [rdi+rcx] - lea rbx, [rdi+rdx*4] - prefetcht0 [rbx] - prefetcht0 [rbx+rdx] - prefetcht0 [rbx+rdx*2] - prefetcht0 [rbx+rcx] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - pxor xmm6, xmm6 ; clear xmm6 for accumulating sse - mov rcx, 16 - -.var16loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rdi] - - prefetcht0 [rsi+rax*8] - prefetcht0 [rdi+rdx*8] - - movdqa xmm3, xmm1 - movdqa xmm4, xmm2 - - - punpcklbw xmm1, xmm0 - punpckhbw xmm3, xmm0 - - punpcklbw xmm2, xmm0 - punpckhbw xmm4, xmm0 - - - psubw xmm1, xmm2 - psubw xmm3, xmm4 - - paddw xmm7, xmm1 - pmaddwd xmm1, xmm1 - - paddw xmm7, xmm3 - pmaddwd xmm3, xmm3 - - paddd xmm6, xmm1 - paddd xmm6, xmm3 - - add rsi, rax - add rdi, rdx - - sub rcx, 1 - jnz .var16loop - - - movdqa xmm1, xmm6 - pxor xmm6, xmm6 - - pxor xmm5, xmm5 - punpcklwd xmm6, xmm7 - - punpckhwd xmm5, xmm7 - psrad xmm5, 16 - - psrad xmm6, 16 - paddd xmm6, xmm5 - - movdqa xmm2, xmm1 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddd xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddd xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movd DWORD PTR [rax], xmm7 - movd DWORD PTR [rdi], xmm1 - - - ; begin epilog - pop rdi - pop rsi - pop rbx - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret 
- - - - -;unsigned int vp9_get8x8var_sse2 -;( -; unsigned char * src_ptr, -; int source_stride, -; unsigned char * ref_ptr, -; int recon_stride, -; unsigned int * SSE, -; int * Sum -;) -global sym(vp9_get8x8var_sse2) PRIVATE -sym(vp9_get8x8var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - mov rsi, arg(0) ;[src_ptr] - mov rdi, arg(2) ;[ref_ptr] - - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] - - pxor xmm0, xmm0 ; clear xmm0 for unpack - pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs - - movq xmm1, QWORD PTR [rsi] - movq xmm2, QWORD PTR [rdi] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - psubsw xmm1, xmm2 - paddw xmm7, xmm1 - - pmaddwd xmm1, xmm1 - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movq xmm2, QWORD PTR[rsi + rax * 2] - movq xmm3, QWORD PTR[rdi + rdx * 2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - movq xmm2, QWORD PTR[rsi + rax *2] - movq xmm3, QWORD PTR[rdi + rdx *2] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - lea rsi, [rsi + rax * 2] - lea rdi, [rdi + rdx * 2] - - movq xmm2, QWORD PTR[rsi + rax] - movq xmm3, QWORD PTR[rdi + rdx] - - punpcklbw xmm2, xmm0 - punpcklbw xmm3, xmm0 - - psubsw xmm2, xmm3 - paddw xmm7, xmm2 - - pmaddwd xmm2, xmm2 - paddd xmm1, xmm2 - - - movdqa xmm6, xmm7 - punpcklwd xmm6, xmm0 - - punpckhwd xmm7, xmm0 - movdqa xmm2, xmm1 - - paddw xmm6, xmm7 - punpckldq xmm1, xmm0 - - punpckhdq xmm2, xmm0 - movdqa xmm7, xmm6 - - paddd xmm1, xmm2 - punpckldq xmm6, xmm0 - - punpckhdq xmm7, xmm0 - paddw xmm6, xmm7 - - movdqa xmm2, xmm1 - movdqa xmm7, xmm6 - - psrldq xmm1, 8 - psrldq xmm6, 8 - - paddw xmm7, xmm6 - paddd xmm1, xmm2 - - mov rax, arg(5) ;[Sum] - mov rdi, arg(4) ;[SSE] - - movq rdx, xmm7 - movsx rcx, dx - - mov dword ptr [rax], ecx - movd DWORD PTR [rdi], xmm1 - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_horiz_vert_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE -sym(vp9_half_horiz_vert_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr 
arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source -%else - add rsi, r8 -%endif - -.half_horiz_vert_variance8x_h_1: - - movq xmm1, QWORD PTR [rsi] ; - movq xmm2, QWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - - sub rcx, 1 ; - jnz .half_horiz_vert_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_half_vert_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE -sym(vp9_half_vert_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; -.half_vert_variance8x_h_1: - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) 
;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - - sub rcx, 1 ; - jnz .half_vert_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_half_horiz_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE -sym(vp9_half_horiz_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor xmm0, xmm0 ; -.half_horiz_variance8x_h_1: - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .half_horiz_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/media/libvpx/vp9/encoder/x86/vp9_variance_mmx.c b/media/libvpx/vp9/encoder/x86/vp9_variance_mmx.c deleted file mode 100644 index a3d011401dd7..000000000000 --- a/media/libvpx/vp9/encoder/x86/vp9_variance_mmx.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" -#include "vpx_ports/mem.h" - -extern unsigned int vp9_get_mb_ss_mmx(const int16_t *src_ptr); -extern unsigned int vp9_get8x8var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); -extern unsigned int vp9_get4x4var_mmx -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *SSE, - int *Sum -); - -unsigned int vp9_variance4x4_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - -unsigned int vp9_variance8x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg); - *sse = var; - - return (var - (((unsigned int)avg * avg) >> 6)); -} - -unsigned int vp9_mse16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3; - - - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, - &sse1, &sum1); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, - ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, - ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - *sse = var; - return var; -} - - -unsigned int vp9_variance16x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, sse2, sse3, var; - int sum0, sum1, sum2, sum3, avg; - - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, - &sse1, &sum1); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, - ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2); - vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, - ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); - - var = sse0 + sse1 + sse2 + sse3; - avg = sum0 + sum1 + sum2 + sum3; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); -} - -unsigned int vp9_variance16x8_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, var; - int sum0, sum1, avg; - - vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, - &sse1, &sum1); - - var = sse0 + sse1; - avg = sum0 + sum1; - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); -} - - -unsigned int vp9_variance8x16_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0, sse1, var; - int sum0, sum1, 
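
Every wrapper in this deleted file follows one identity: for a w x h block, variance is the sum of squared differences minus the squared sum divided by the pixel count, and because the count is always a power of two the division is the shift seen in these functions (>> 4 for 4x4, >> 6 for 8x8, >> 7 for 8x16 and 16x8, >> 8 for 16x16). Factored out as a sketch (illustrative helper, not part of the patch):

    /* log2_count is log2(w * h), e.g. 7 for an 8x16 block. */
    static unsigned int variance_from_sums(unsigned int sse, int sum,
                                           int log2_count) {
      return sse - (((unsigned int)sum * sum) >> log2_count);
    }
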
avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                    &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
-                    ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-
-  return (var - (((unsigned int)avg * avg) >> 7));
-}
diff --git a/media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index 79e42c4cd4ac..b4d2b0ac408d 100644
--- a/media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -8,120 +8,153 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
 #include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
+                                       const unsigned char *ref, int ref_stride,
+                                       unsigned int *sse, int *sum);
 
-unsigned int vp9_get_mb_ss_sse2
-(
-  const int16_t *src_ptr
-);
-unsigned int vp9_get16x16var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-unsigned int vp9_get8x8var_sse2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-void vp9_half_horiz_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_horiz_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance8x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
-void vp9_half_vert_variance16x_h_sse2
-(
-  const unsigned char *ref_ptr,
-  int ref_pixels_per_line,
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  unsigned int Height,
-  int *sum,
-  unsigned int *sumsquared
-);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
 
-typedef unsigned int (*get_var_sse2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
 
-static void variance_sse2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          int w, int h, unsigned int *sse,
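
The core idiom in the new vp9_get_mb_ss_sse2 above is pmaddwd (_mm_madd_epi16) with the same vector as both operands: it squares eight 16-bit values and pairwise-adds them into four 32-bit lanes in a single instruction. A standalone sketch of the idiom, assuming SSE2 and a length that is a multiple of 8 (the helper name is invented):

    #include <emmintrin.h>
    #include <stdint.h>

    static unsigned int sum_squares_i16(const int16_t *v, int n) {
      __m128i acc = _mm_setzero_si128();
      int i;
      for (i = 0; i < n; i += 8) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(v + i));
        acc = _mm_add_epi32(acc, _mm_madd_epi16(x, x));  /* pairwise x*x */
      }
      /* Fold the four 32-bit lanes down to one scalar. */
      acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
      acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
      return (unsigned int)_mm_cvtsi128_si32(acc);
    }
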
int *sum, - get_var_sse2 var_fn, int block_size) { - unsigned int sse0; - int sum0; + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +unsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0), + _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); + + return 0; +} + +unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64( + (const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); + + return 0; +} + +unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + 
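
A detail in vp9_get8x8var_sse2 above: the differences are accumulated in eight 16-bit lanes, folded with byte shifts, and the final (int16_t) cast is what restores the sign of the low lane (pextrw zero-extends). The bound that makes this safe, worked out:

    /* Each difference lies in [-255, 255]; the fully folded lane holds all
     * 8 * 8 = 64 of them, so |sum| <= 64 * 255 = 16320 < 32767, and a
     * signed 16-bit lane can never overflow. */
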
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0) + + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); + + return 0; +} + + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + variance_fn_t var_fn, int block_size) { int i, j; *sse = 0; @@ -129,217 +162,156 @@ static void variance_sse2(const unsigned char *src_ptr, int source_stride, for (i = 0; i < h; i += block_size) { for (j = 0; j < w; j += block_size) { - var_fn(src_ptr + source_stride * i + j, source_stride, - ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); *sse += sse0; *sum += sum0; } } } -unsigned int vp9_variance4x4_sse2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 4)); -} - -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + int sum; + vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 4); } -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, - &var, &avg, vp9_get4x4var_mmx, 4); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 5)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, + sse, &sum, vp9_get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); } -unsigned int vp9_variance8x8_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 6)); +unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int 
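
vp9_get16x16var_sse2 above stops folding one step earlier and adds two lanes in 32-bit arithmetic, and the same bound explains why: a 16x16 block has 256 differences, and 256 * 255 = 65280 overflows int16_t, while each of the two remaining lanes holds at most 128 differences:

    /* One lane for all 256 diffs:  256 * 255 = 65280  > 32767  (overflow)
     * Two lanes of <= 128 diffs:   128 * 255 = 32640 <= 32767  (safe)
     * hence: *sum = (int16_t)lane0 + (int16_t)lane1, added as ints. */
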
*sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, + sse, &sum, vp9_get4x4var_sse2, 4); + return *sse - (((unsigned int)sum * sum) >> 5); } -unsigned int vp9_variance16x8_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); +unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 6); } -unsigned int vp9_variance8x16_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, - &var, &avg, vp9_get8x8var_sse2, 8); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 7)); +unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); } -unsigned int vp9_variance16x16_sse2 -( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((unsigned int)avg * avg) >> 8)); +unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, + sse, &sum, vp9_get8x8var_sse2, 8); + return *sse - (((unsigned int)sum * sum) >> 7); } -unsigned int vp9_mse16x16_sse2( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - unsigned int sse0; - int sum0; - vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, - &sum0); - *sse = sse0; - return sse0; -} - -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 10)); + int sum; + vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + return *sse - (((unsigned int)sum * sum) >> 8); } -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 
9)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 10); } -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 9)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 12)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 9); } -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; - - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 12); } -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, +unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse) { - unsigned int var; - int avg; + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} - variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, - &var, &avg, vp9_get16x16var_sse2, 16); - *sse = var; - return (var - (((int64_t)avg * avg) >> 11)); +unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, + sse, &sum, vp9_get16x16var_sse2, 16); + return *sse - (((int64_t)sum * sum) >> 11); +} + +unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, 
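
The wrappers above switch from (unsigned int)sum * sum to (int64_t)sum * sum exactly where the square stops fitting in 32 bits; the shift is still log2 of the pixel count (>> 9 for 32x16 and 16x32, up to >> 12 for 64x64). Worked bounds:

    /* 16x16: |sum| <= 256 * 255 = 65280,  65280^2  = 4,261,478,400  (fits uint32)
     * 32x16: |sum| <= 512 * 255 = 130560, 130560^2 = 17,045,913,600 (overflows)
     * so every block larger than 256 pixels squares the sum in 64-bit math. */
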
sse);
+  return *sse;
+}
+
+unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
 }
 
 #define DECL(w, opt) \
@@ -498,58 +470,3 @@ FNS(ssse3, ssse3);
 
 #undef FNS
 #undef FN
-
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vp9_half_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
-  const unsigned char *src_ptr,
-  int src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
diff --git a/media/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/vp9/vp9_cx_iface.c
index 9a23ebd5398d..0f0b7a5abbde 100644
--- a/media/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/vp9/vp9_cx_iface.c
@@ -11,84 +11,74 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "./vpx_config.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vpx_version.h"
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_encoder.h"
 #include "vpx/vp8cx.h"
 #include "vp9/encoder/vp9_firstpass.h"
-#include "vp9/common/vp9_onyx.h"
 #include "vp9/vp9_iface_common.h"
 
 struct vp9_extracfg {
-  struct vpx_codec_pkt_list *pkt_list;
-  int cpu_used;  /* available cpu percentage in 1/16 */
+  int cpu_used;  // available cpu percentage in 1/16
   unsigned int enable_auto_alt_ref;
   unsigned int noise_sensitivity;
-  unsigned int Sharpness;
+  unsigned int sharpness;
   unsigned int static_thresh;
   unsigned int tile_columns;
   unsigned int tile_rows;
   unsigned int arnr_max_frames;
   unsigned int arnr_strength;
-  unsigned int arnr_type;
-  unsigned int experimental;
   vp8e_tuning tuning;
-  unsigned int cq_level;  /* constrained quality level */
+  unsigned int cq_level;  // constrained quality level
   unsigned int rc_max_intra_bitrate_pct;
   unsigned int lossless;
   unsigned int frame_parallel_decoding_mode;
-  unsigned int aq_mode;
+  AQ_MODE aq_mode;
+  unsigned int frame_periodic_boost;
+  vpx_bit_depth_t bit_depth;
+  vp9e_tune_content content;
 };
 
-struct extraconfig_map {
-  int usage;
-  struct vp9_extracfg cfg;
-};
-
-static const struct extraconfig_map extracfg_map[] = {
-  {
-    0,
-    { // NOLINT
-      NULL,
-      0,  /* cpu_used */
-      1,  /* enable_auto_alt_ref */
-      0,  /* noise_sensitivity */
-      0,  /* Sharpness */
-      0,  /* static_thresh */
-      0,  /* tile_columns */
-      0,  /* tile_rows */
-      7,  /* arnr_max_frames */
-      5,  /* arnr_strength */
-      3,  /* arnr_type*/
-      0,  /* experimental mode */
-      0,  /* tuning*/
-      10, /* cq_level */
-      0,  /*
rc_max_intra_bitrate_pct */ - 0, /* lossless */ - 0, /* frame_parallel_decoding_mode */ - 0, /* aq_mode */ - } - } +static struct vp9_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 0, // tile_columns + 0, // tile_rows + 7, // arnr_max_frames + 5, // arnr_strength + VP8_TUNE_PSNR, // tuning + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // lossless + 0, // frame_parallel_decoding_mode + NO_AQ, // aq_mode + 0, // frame_periodic_delta_q + VPX_BITS_8, // Bit depth + VP9E_CONTENT_DEFAULT // content }; struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; - struct vp9_extracfg vp8_cfg; - VP9_CONFIG oxcf; - VP9_PTR cpi; + struct vp9_extracfg extra_cfg; + VP9EncoderConfig oxcf; + VP9_COMP *cpi; unsigned char *cx_data; - unsigned int cx_data_sz; + size_t cx_data_sz; unsigned char *pending_cx_data; - unsigned int pending_cx_data_sz; + size_t pending_cx_data_sz; int pending_frame_count; - uint32_t pending_frame_sizes[8]; - uint32_t pending_frame_magnitude; + size_t pending_frame_sizes[8]; + size_t pending_frame_magnitude; vpx_image_t preview_img; vp8_postproc_cfg_t preview_ppcfg; - vpx_codec_pkt_list_decl(64) pkt_list; - unsigned int fixed_kf_cntr; + vpx_codec_pkt_list_decl(256) pkt_list; + unsigned int fixed_kf_cntr; }; static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { @@ -100,19 +90,16 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { case VP8_ALTR_FRAME: return VP9_ALT_FLAG; } - assert(!"Invalid Reference Frame"); + assert(0 && "Invalid Reference Frame"); return VP9_LAST_FLAG; } -static vpx_codec_err_t -update_error_state(vpx_codec_alg_priv_t *ctx, - const struct vpx_internal_error_info *error) { - vpx_codec_err_t res; +static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx, + const struct vpx_internal_error_info *error) { + const vpx_codec_err_t res = error->error_code; - if ((res = error->error_code)) - ctx->base.err_detail = error->has_detail - ? error->detail - : NULL; + if (res != VPX_CODEC_OK) + ctx->base.err_detail = error->has_detail ? 
error->detail : NULL; return res; } @@ -143,24 +130,20 @@ update_error_state(vpx_codec_alg_priv_t *ctx, if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean");\ } while (0) -static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, +static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, - const struct vp9_extracfg *vp8_cfg) { - RANGE_CHECK(cfg, g_w, 1, 65535); /* 16 bits available */ - RANGE_CHECK(cfg, g_h, 1, 65535); /* 16 bits available */ + const struct vp9_extracfg *extra_cfg) { + RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); RANGE_CHECK_HI(cfg, g_profile, 3); RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); - RANGE_CHECK_BOOL(vp8_cfg, lossless); - if (vp8_cfg->lossless) { - RANGE_CHECK_HI(cfg, rc_max_quantizer, 0); - RANGE_CHECK_HI(cfg, rc_min_quantizer, 0); - } - RANGE_CHECK(vp8_cfg, aq_mode, 0, AQ_MODES_COUNT - 1); - + RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 1); + RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); RANGE_CHECK_HI(cfg, g_threads, 64); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q); @@ -168,63 +151,148 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); - // RANGE_CHECK_BOOL(cfg, g_delete_firstpassfile); RANGE_CHECK_BOOL(cfg, rc_resize_allowed); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); - RANGE_CHECK(cfg, ss_number_layers, 1, - VPX_SS_MAX_LAYERS); /*Spatial layers max */ - /* VP8 does not support a lower bound on the keyframe interval in - * automatic keyframe placement mode. - */ - if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist - && cfg->kf_min_dist > 0) + if (cfg->rc_resize_allowed == 1) { + RANGE_CHECK(cfg, rc_scaled_width, 1, cfg->g_w); + RANGE_CHECK(cfg, rc_scaled_height, 1, cfg->g_h); + } + + RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); + RANGE_CHECK(cfg, ts_number_layers, 1, VPX_TS_MAX_LAYERS); + + if (cfg->ts_number_layers > 1) { + unsigned int i; + for (i = 1; i < cfg->ts_number_layers; ++i) + if (cfg->ts_target_bitrate[i] < cfg->ts_target_bitrate[i - 1]) + ERROR("ts_target_bitrate entries are not increasing"); + + RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers - 1], 1, 1); + for (i = cfg->ts_number_layers - 2; i > 0; --i) + if (cfg->ts_rate_decimator[i - 1] != 2 * cfg->ts_rate_decimator[i]) + ERROR("ts_rate_decimator factors are not powers of 2"); + } + +#if CONFIG_SPATIAL_SVC + if (cfg->ss_number_layers * cfg->ts_number_layers > REF_FRAMES) + ERROR("Too many layers. 
Maximum 8 layers could be set"); + + if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) && + cfg->g_pass == VPX_RC_LAST_PASS) { + unsigned int i, alt_ref_sum = 0; + for (i = 0; i < cfg->ss_number_layers; ++i) { + if (cfg->ss_enable_auto_alt_ref[i]) + ++alt_ref_sum; + } + if (alt_ref_sum > + REF_FRAMES - cfg->ss_number_layers * cfg->ts_number_layers) + ERROR("Not enough ref buffers for svc alt ref frames"); + if ((cfg->ss_number_layers > 3 || + cfg->ss_number_layers * cfg->ts_number_layers > 4) && + cfg->g_error_resilient == 0) + ERROR("Multiple frame context are not supported for more than 3 spatial " + "layers or more than 4 spatial x temporal layers"); + } +#endif + + // VP9 does not support a lower bound on the keyframe interval in + // automatic keyframe placement mode. + if (cfg->kf_mode != VPX_KF_DISABLED && + cfg->kf_min_dist != cfg->kf_max_dist && + cfg->kf_min_dist > 0) ERROR("kf_min_dist not supported in auto mode, use 0 " "or kf_max_dist instead."); - RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref); - RANGE_CHECK(vp8_cfg, cpu_used, -16, 16); + RANGE_CHECK_BOOL(extra_cfg, enable_auto_alt_ref); + RANGE_CHECK(extra_cfg, cpu_used, -16, 16); + RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); + RANGE_CHECK(extra_cfg, tile_columns, 0, 6); + RANGE_CHECK(extra_cfg, tile_rows, 0, 2); + RANGE_CHECK_HI(extra_cfg, sharpness, 7); + RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15); + RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); + RANGE_CHECK(extra_cfg, cq_level, 0, 63); + RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12); + RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, + VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); - RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6); - - RANGE_CHECK(vp8_cfg, tile_columns, 0, 6); - RANGE_CHECK(vp8_cfg, tile_rows, 0, 2); - RANGE_CHECK_HI(vp8_cfg, Sharpness, 7); - RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15); - RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6); - RANGE_CHECK(vp8_cfg, arnr_type, 1, 3); - RANGE_CHECK(vp8_cfg, cq_level, 0, 63); + // TODO(yaowu): remove this when ssim tuning is implemented for vp9 + if (extra_cfg->tuning == VP8_TUNE_SSIM) + ERROR("Option --tune=ssim is not currently supported in VP9."); if (cfg->g_pass == VPX_RC_LAST_PASS) { - size_t packet_sz = sizeof(FIRSTPASS_STATS); - int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); - FIRSTPASS_STATS *stats; + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + const FIRSTPASS_STATS *stats; - if (!cfg->rc_twopass_stats_in.buf) + if (cfg->rc_twopass_stats_in.buf == NULL) ERROR("rc_twopass_stats_in.buf not set."); if (cfg->rc_twopass_stats_in.sz % packet_sz) ERROR("rc_twopass_stats_in.sz indicates truncated packet."); - if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) - ERROR("rc_twopass_stats_in requires at least two packets."); + if (cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) { + int i; + unsigned int n_packets_per_layer[VPX_SS_MAX_LAYERS] = {0}; - stats = (void *)((char *)cfg->rc_twopass_stats_in.buf - + (n_packets - 1) * packet_sz); + stats = cfg->rc_twopass_stats_in.buf; + for (i = 0; i < n_packets; ++i) { + const int layer_id = (int)stats[i].spatial_layer_id; + if (layer_id >= 0 && layer_id < (int)cfg->ss_number_layers) { + ++n_packets_per_layer[layer_id]; + } + } - if ((int)(stats->count + 0.5) != n_packets - 1) - ERROR("rc_twopass_stats_in missing EOS stats packet"); + for (i = 0; i < (int)cfg->ss_number_layers; ++i) { + unsigned int layer_id; + if 
(n_packets_per_layer[i] < 2) { + ERROR("rc_twopass_stats_in requires at least two packets for each " + "layer."); + } + + stats = (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + + n_packets - cfg->ss_number_layers + i; + layer_id = (int)stats->spatial_layer_id; + + if (layer_id >= cfg->ss_number_layers + ||(unsigned int)(stats->count + 0.5) != + n_packets_per_layer[layer_id] - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } + } else { + if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) + ERROR("rc_twopass_stats_in requires at least two packets."); + + stats = + (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; + + if ((int)(stats->count + 0.5) != n_packets - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } } +#if !CONFIG_VP9_HIGHBITDEPTH + if (cfg->g_profile > (unsigned int)PROFILE_1) + ERROR("Profile > 1 not supported in this build configuration"); +#endif + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + extra_cfg->bit_depth > VPX_BITS_8) + ERROR("Codec high bit-depth not supported in profile < 2"); + if (cfg->g_profile > (unsigned int)PROFILE_1 && + extra_cfg->bit_depth == VPX_BITS_8) + ERROR("Codec bit-depth 8 not supported in profile > 1"); + return VPX_CODEC_OK; } static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, - const vpx_image_t *img) { + const vpx_image_t *img) { switch (img->fmt) { case VPX_IMG_FMT_YV12: case VPX_IMG_FMT_I420: @@ -234,121 +302,160 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, default: ERROR("Invalid image format. Only YV12, I420, I422, I444 images are " "supported."); + break; } - if ((img->d_w != ctx->cfg.g_w) || (img->d_h != ctx->cfg.g_h)) + if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) ERROR("Image size must match encoder init configuration size"); return VPX_CODEC_OK; } - -static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, - vpx_codec_enc_cfg_t cfg, - struct vp9_extracfg vp8_cfg) { - oxcf->version = cfg.g_profile | (vp8_cfg.experimental ? 
0x4 : 0); - oxcf->width = cfg.g_w; - oxcf->height = cfg.g_h; - /* guess a frame rate if out of whack, use 30 */ - oxcf->framerate = (double)(cfg.g_timebase.den) - / (double)(cfg.g_timebase.num); - - if (oxcf->framerate > 180) { - oxcf->framerate = 30; +static int get_image_bps(const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: return 12; + case VPX_IMG_FMT_I422: return 16; + case VPX_IMG_FMT_I444: return 24; + case VPX_IMG_FMT_I42016: return 24; + case VPX_IMG_FMT_I42216: return 32; + case VPX_IMG_FMT_I44416: return 48; + default: assert(0 && "Invalid image format"); break; } + return 0; +} - switch (cfg.g_pass) { +static vpx_codec_err_t set_encoder_config( + VP9EncoderConfig *oxcf, + const vpx_codec_enc_cfg_t *cfg, + const struct vp9_extracfg *extra_cfg) { + const int is_vbr = cfg->rc_end_usage == VPX_VBR; + oxcf->profile = cfg->g_profile; + oxcf->width = cfg->g_w; + oxcf->height = cfg->g_h; + oxcf->bit_depth = extra_cfg->bit_depth; + oxcf->input_bit_depth = cfg->g_input_bit_depth; + // guess a frame rate if out of whack, use 30 + oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (oxcf->init_framerate > 180) + oxcf->init_framerate = 30; + + oxcf->mode = GOOD; + + switch (cfg->g_pass) { case VPX_RC_ONE_PASS: - oxcf->Mode = MODE_GOODQUALITY; + oxcf->pass = 0; break; case VPX_RC_FIRST_PASS: - oxcf->Mode = MODE_FIRSTPASS; + oxcf->pass = 1; break; case VPX_RC_LAST_PASS: - oxcf->Mode = MODE_SECONDPASS_BEST; + oxcf->pass = 2; break; } - if (cfg.g_pass == VPX_RC_FIRST_PASS) { - oxcf->allow_lag = 0; - oxcf->lag_in_frames = 0; - } else { - oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; - oxcf->lag_in_frames = cfg.g_lag_in_frames; - } + oxcf->lag_in_frames = cfg->g_pass == VPX_RC_FIRST_PASS ? 0 + : cfg->g_lag_in_frames; + oxcf->rc_mode = cfg->rc_end_usage; - // VBR only supported for now. - // CBR code has been deprectated for experimental phase. - // CQ mode not yet tested - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; - if (cfg.rc_end_usage == VPX_CQ) - oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; - else if (cfg.rc_end_usage == VPX_Q) - oxcf->end_usage = USAGE_CONSTANT_QUALITY; - else if (cfg.rc_end_usage == VPX_CBR) - oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + // Convert target bandwidth from Kbit/s to Bit/s + oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; + oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; - oxcf->target_bandwidth = cfg.rc_target_bitrate; - oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; - - oxcf->best_allowed_q = cfg.rc_min_quantizer; - oxcf->worst_allowed_q = cfg.rc_max_quantizer; - oxcf->cq_level = vp8_cfg.cq_level; + oxcf->best_allowed_q = + extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer); + oxcf->worst_allowed_q = + extra_cfg->lossless ? 
0 : vp9_quantizer_to_qindex(cfg->rc_max_quantizer); + oxcf->cq_level = vp9_quantizer_to_qindex(extra_cfg->cq_level); oxcf->fixed_q = -1; - oxcf->under_shoot_pct = cfg.rc_undershoot_pct; - oxcf->over_shoot_pct = cfg.rc_overshoot_pct; + oxcf->under_shoot_pct = cfg->rc_undershoot_pct; + oxcf->over_shoot_pct = cfg->rc_overshoot_pct; - oxcf->maximum_buffer_size = cfg.rc_buf_sz; - oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; - oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz; + oxcf->allow_spatial_resampling = cfg->rc_resize_allowed; + oxcf->scaled_frame_width = cfg->rc_scaled_width; + oxcf->scaled_frame_height = cfg->rc_scaled_height; - oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; - oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct; - oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct; + oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz; - oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO - && cfg.kf_min_dist != cfg.kf_max_dist; - // oxcf->kf_min_dist = cfg.kf_min_dis; - oxcf->key_freq = cfg.kf_max_dist; + oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; - // oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; - // strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); + oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; + oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; - oxcf->cpu_used = vp8_cfg.cpu_used; - oxcf->encode_breakout = vp8_cfg.static_thresh; - oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; - oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; - oxcf->Sharpness = vp8_cfg.Sharpness; + oxcf->auto_key = cfg->kf_mode == VPX_KF_AUTO && + cfg->kf_min_dist != cfg->kf_max_dist; - oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; - oxcf->output_pkt_list = vp8_cfg.pkt_list; + oxcf->key_freq = cfg->kf_max_dist; - oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; - oxcf->arnr_strength = vp8_cfg.arnr_strength; - oxcf->arnr_type = vp8_cfg.arnr_type; + oxcf->speed = abs(extra_cfg->cpu_used); + oxcf->encode_breakout = extra_cfg->static_thresh; + oxcf->play_alternate = extra_cfg->enable_auto_alt_ref; + oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + oxcf->sharpness = extra_cfg->sharpness; - oxcf->tuning = vp8_cfg.tuning; + oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; - oxcf->tile_columns = vp8_cfg.tile_columns; - oxcf->tile_rows = vp8_cfg.tile_rows; +#if CONFIG_FP_MB_STATS + oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; +#endif - oxcf->lossless = vp8_cfg.lossless; + oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; + oxcf->arnr_strength = extra_cfg->arnr_strength; - oxcf->error_resilient_mode = cfg.g_error_resilient; - oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode; + oxcf->tuning = extra_cfg->tuning; + oxcf->content = extra_cfg->content; - oxcf->aq_mode = vp8_cfg.aq_mode; + oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->tile_rows = extra_cfg->tile_rows; + + oxcf->error_resilient_mode = cfg->g_error_resilient; + oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; + + oxcf->aq_mode = extra_cfg->aq_mode; + + oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; + + oxcf->ss_number_layers = cfg->ss_number_layers; + + if (oxcf->ss_number_layers > 1) { + int i; + for (i = 0; i < VPX_SS_MAX_LAYERS; 
++i) { + oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i]; +#if CONFIG_SPATIAL_SVC + oxcf->ss_play_alternate[i] = cfg->ss_enable_auto_alt_ref[i]; +#endif + } + } else if (oxcf->ss_number_layers == 1) { + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; +#if CONFIG_SPATIAL_SVC + oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref; +#endif + } + + oxcf->ts_number_layers = cfg->ts_number_layers; + + if (oxcf->ts_number_layers > 1) { + int i; + for (i = 0; i < VPX_TS_MAX_LAYERS; ++i) { + oxcf->ts_target_bitrate[i] = 1000 * cfg->ts_target_bitrate[i]; + oxcf->ts_rate_decimator[i] = cfg->ts_rate_decimator[i]; + } + } else if (oxcf->ts_number_layers == 1) { + oxcf->ts_target_bitrate[0] = (int)oxcf->target_bandwidth; + oxcf->ts_rate_decimator[0] = 1; + } - oxcf->ss_number_layers = cfg.ss_number_layers; /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity); - printf("Sharpness: %d\n", oxcf->Sharpness); + printf("sharpness: %d\n", oxcf->sharpness); printf("cpu_used: %d\n", oxcf->cpu_used); - printf("Mode: %d\n", oxcf->Mode); - // printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); + printf("Mode: %d\n", oxcf->mode); printf("auto_key: %d\n", oxcf->auto_key); printf("key_freq: %d\n", oxcf->key_freq); printf("end_usage: %d\n", oxcf->end_usage); @@ -360,10 +467,12 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, printf("fixed_q: %d\n", oxcf->fixed_q); printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q); printf("best_allowed_q: %d\n", oxcf->best_allowed_q); + printf("allow_spatial_resampling: %d\n", oxcf->allow_spatial_resampling); + printf("scaled_frame_width: %d\n", oxcf->scaled_frame_width); + printf("scaled_frame_height: %d\n", oxcf->scaled_frame_height); printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); - printf("allow_lag: %d\n", oxcf->allow_lag); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("play_alternate: %d\n", oxcf->play_alternate); printf("Version: %d\n", oxcf->Version); @@ -375,222 +484,262 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, return VPX_CODEC_OK; } -static vpx_codec_err_t vp9e_set_config(vpx_codec_alg_priv_t *ctx, - const vpx_codec_enc_cfg_t *cfg) { +static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, + const vpx_codec_enc_cfg_t *cfg) { vpx_codec_err_t res; - if ((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h)) + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) ERROR("Cannot change width or height after initialization"); - /* Prevent increasing lag_in_frames. This check is stricter than it needs - * to be -- the limit is not increasing past the first lag_in_frames - * value, but we don't track the initial config, only the last successful - * config. - */ - if ((cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)) + // Prevent increasing lag_in_frames. This check is stricter than it needs + // to be -- the limit is not increasing past the first lag_in_frames + // value, but we don't track the initial config, only the last successful + // config. 
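
Two unit conversions in set_encoder_config above are easy to miss: the initial frame rate is derived from the stream timebase, and the API's kilobit rates become bits per second. A minimal sketch with made-up values (nothing below is from the patch):

    #include <stdio.h>

    int main(void) {
      const int timebase_num = 1, timebase_den = 30;  /* one tick = 1/30 s */
      const unsigned int rc_target_bitrate = 500;     /* API value, kbit/s */

      const double init_framerate = (double)timebase_den / timebase_num;
      const long target_bandwidth = 1000L * rc_target_bitrate;

      printf("framerate=%.1f bandwidth=%ld bit/s\n",
             init_framerate, target_bandwidth);  /* 30.0, 500000 */
      return 0;
    }
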
+ if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) ERROR("Cannot increase lag_in_frames"); - res = validate_config(ctx, cfg, &ctx->vp8_cfg); + res = validate_config(ctx, cfg, &ctx->extra_cfg); - if (!res) { + if (res == VPX_CODEC_OK) { ctx->cfg = *cfg; - set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); } return res; } - -int vp9_reverse_trans(int q); - - -static vpx_codec_err_t get_param(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { - void *arg = va_arg(args, void *); - -#define MAP(id, var) case id: *(RECAST(id, arg)) = var; break - - if (!arg) +static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return VPX_CODEC_INVALID_PARAM; - - switch (ctrl_id) { - MAP(VP8E_GET_LAST_QUANTIZER, vp9_get_quantizer(ctx->cpi)); - MAP(VP8E_GET_LAST_QUANTIZER_64, - vp9_reverse_trans(vp9_get_quantizer(ctx->cpi))); - } - + *arg = vp9_get_quantizer(ctx->cpi); return VPX_CODEC_OK; -#undef MAP } +static vpx_codec_err_t ctrl_get_quantizer64(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) + return VPX_CODEC_INVALID_PARAM; + *arg = vp9_qindex_to_quantizer(vp9_get_quantizer(ctx->cpi)); + return VPX_CODEC_OK; +} -static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { - vpx_codec_err_t res = VPX_CODEC_OK; - struct vp9_extracfg xcfg = ctx->vp8_cfg; - -#define MAP(id, var) case id: var = CAST(id, args); break; - - switch (ctrl_id) { - MAP(VP8E_SET_CPUUSED, xcfg.cpu_used); - MAP(VP8E_SET_ENABLEAUTOALTREF, xcfg.enable_auto_alt_ref); - MAP(VP8E_SET_NOISE_SENSITIVITY, xcfg.noise_sensitivity); - MAP(VP8E_SET_SHARPNESS, xcfg.Sharpness); - MAP(VP8E_SET_STATIC_THRESHOLD, xcfg.static_thresh); - MAP(VP9E_SET_TILE_COLUMNS, xcfg.tile_columns); - MAP(VP9E_SET_TILE_ROWS, xcfg.tile_rows); - MAP(VP8E_SET_ARNR_MAXFRAMES, xcfg.arnr_max_frames); - MAP(VP8E_SET_ARNR_STRENGTH, xcfg.arnr_strength); - MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type); - MAP(VP8E_SET_TUNING, xcfg.tuning); - MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level); - MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct); - MAP(VP9E_SET_LOSSLESS, xcfg.lossless); - MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode); - MAP(VP9E_SET_AQ_MODE, xcfg.aq_mode); - } - - res = validate_config(ctx, &ctx->cfg, &xcfg); - - if (!res) { - ctx->vp8_cfg = xcfg; - set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); +static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, + const struct vp9_extracfg *extra_cfg) { + const vpx_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == VPX_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); vp9_change_config(ctx->cpi, &ctx->oxcf); } - return res; -#undef MAP } +static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + return update_extra_cfg(ctx, &extra_cfg); +} -static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx, - int experimental) { - vpx_codec_err_t res = VPX_CODEC_OK; - struct vpx_codec_alg_priv *priv; - vpx_codec_enc_cfg_t *cfg; - unsigned int i; +static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + 
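
The two getters above expose the same value on different scales: VP8E_GET_LAST_QUANTIZER reports the codec's internal quantizer index, while the _64 variant maps it back to the 0-63 range used in the configuration. Typical caller-side usage, assuming an initialized encoder context named codec (error handling omitted):

    int qindex = 0, q = 0;
    vpx_codec_control(&codec, VP8E_GET_LAST_QUANTIZER, &qindex);
    vpx_codec_control(&codec, VP8E_GET_LAST_QUANTIZER_64, &q);  /* 0..63 */
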
extra_cfg.enable_auto_alt_ref = CAST(VP8E_SET_ENABLEAUTOALTREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} - VP9_PTR optr; +static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} - if (!ctx->priv) { - priv = calloc(1, sizeof(struct vpx_codec_alg_priv)); +static vpx_codec_err_t ctrl_set_sharpness(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(VP8E_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} - if (!priv) { +static vpx_codec_err_t ctrl_set_static_thresh(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(VP8E_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_tile_columns(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = CAST(VP9E_SET_TILE_COLUMNS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = CAST(VP9E_SET_TILE_ROWS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(VP8E_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_strength(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(VP8E_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_arnr_type(vpx_codec_alg_priv_t *ctx, + va_list args) { + (void)ctx; + (void)args; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_tuning(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(VP8E_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_cq_level(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cq_level = CAST(VP8E_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(VP8E_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(VP9E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_parallel_decoding_mode( + vpx_codec_alg_priv_t *ctx, va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + CAST(VP9E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg 
extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_periodic_boost = CAST(VP9E_SET_FRAME_PERIODIC_BOOST, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { + vpx_codec_err_t res = VPX_CODEC_OK; + (void)data; + + if (ctx->priv == NULL) { + vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv)); + if (priv == NULL) return VPX_CODEC_MEM_ERROR; - } - ctx->priv = &priv->base; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->iface = ctx->iface; - ctx->priv->alg_priv = priv; + ctx->priv = (vpx_codec_priv_t *)priv; ctx->priv->init_flags = ctx->init_flags; ctx->priv->enc.total_encoders = 1; if (ctx->config.enc) { - /* Update the reference to the config structure to an - * internal copy. - */ - ctx->priv->alg_priv->cfg = *ctx->config.enc; - ctx->config.enc = &ctx->priv->alg_priv->cfg; - } - - cfg = &ctx->priv->alg_priv->cfg; - - /* Select the extra vp6 configuration table based on the current - * usage value. If the current usage value isn't found, use the - * values for usage case 0. - */ - for (i = 0; - extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage; - i++) {} - - priv->vp8_cfg = extracfg_map[i].cfg; - priv->vp8_cfg.pkt_list = &priv->pkt_list.head; - priv->vp8_cfg.experimental = experimental; - - // TODO(agrange) Check the limits set on this buffer, or the check that is - // applied in vp9e_encode. - priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8; -// priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2; - - if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096; - - priv->cx_data = malloc(priv->cx_data_sz); - - if (!priv->cx_data) { - return VPX_CODEC_MEM_ERROR; + // Update the reference to the config structure to an internal copy. 
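
encoder_init above is the half of the handshake that runs inside libvpx; from the application side the same path is reached roughly as follows. A sketch using the public API, with illustrative dimensions and bitrate:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static int init_vp9_encoder(vpx_codec_ctx_t *codec) {
      vpx_codec_enc_cfg_t cfg;
      if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0))
        return -1;
      cfg.g_w = 640;                /* illustrative values */
      cfg.g_h = 480;
      cfg.rc_target_bitrate = 500;  /* kbit/s, scaled to bit/s internally */
      return vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0) ==
                 VPX_CODEC_OK ? 0 : -1;
    }
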
+ priv->cfg = *ctx->config.enc; + ctx->config.enc = &priv->cfg; } + priv->extra_cfg = default_extra_cfg; vp9_initialize_enc(); - res = validate_config(priv, &priv->cfg, &priv->vp8_cfg); + res = validate_config(priv, &priv->cfg, &priv->extra_cfg); - if (!res) { - set_vp9e_config(&ctx->priv->alg_priv->oxcf, - ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg); - optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf); - - if (!optr) + if (res == VPX_CODEC_OK) { + set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); + priv->cpi = vp9_create_compressor(&priv->oxcf); + if (priv->cpi == NULL) res = VPX_CODEC_MEM_ERROR; else - ctx->priv->alg_priv->cpi = optr; + priv->cpi->output_pkt_list = &priv->pkt_list.head; } } return res; } - -static vpx_codec_err_t vp9e_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { - return vp9e_common_init(ctx, 0); -} - - -#if CONFIG_EXPERIMENTAL -static vpx_codec_err_t vp9e_exp_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { - return vp9e_common_init(ctx, 1); -} -#endif - - -static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) { +static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); - vp9_remove_compressor(&ctx->cpi); - free(ctx); + vp9_remove_compressor(ctx->cpi); + vpx_free(ctx); return VPX_CODEC_OK; } -static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, - unsigned long duration, - unsigned long deadline) { - unsigned int new_qc; +static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, + unsigned long duration, + unsigned long deadline) { + MODE new_mode = BEST; - /* Use best quality mode if no deadline is given. */ - if (deadline) - new_qc = MODE_GOODQUALITY; - else - new_qc = MODE_BESTQUALITY; + switch (ctx->cfg.g_pass) { + case VPX_RC_ONE_PASS: + if (deadline > 0) { + const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg; - if (ctx->cfg.g_pass == VPX_RC_FIRST_PASS) - new_qc = MODE_FIRSTPASS; - else if (ctx->cfg.g_pass == VPX_RC_LAST_PASS) - new_qc = (new_qc == MODE_BESTQUALITY) - ? MODE_SECONDPASS_BEST - : MODE_SECONDPASS; + // Convert duration parameter from stream timebase to microseconds. + const uint64_t duration_us = (uint64_t)duration * 1000000 * + (uint64_t)cfg->g_timebase.num /(uint64_t)cfg->g_timebase.den; - if (ctx->oxcf.Mode != new_qc) { - ctx->oxcf.Mode = new_qc; + // If the deadline is more that the duration this frame is to be shown, + // use good quality mode. Otherwise use realtime mode. + new_mode = (deadline > duration_us) ? GOOD : REALTIME; + } else { + new_mode = BEST; + } + break; + case VPX_RC_FIRST_PASS: + break; + case VPX_RC_LAST_PASS: + new_mode = deadline > 0 ? 
GOOD : BEST; + break; + } + + if (ctx->oxcf.mode != new_mode) { + ctx->oxcf.mode = new_mode; vp9_change_config(ctx->cpi, &ctx->oxcf); } } - +// Turn on to test if supplemental superframe data breaks decoding +// #define TEST_SUPPLEMENTAL_SUPERFRAME_DATA static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { uint8_t marker = 0xc0; unsigned int mask; @@ -599,10 +748,10 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { assert(ctx->pending_frame_count); assert(ctx->pending_frame_count <= 8); - /* Add the number of frames to the marker byte */ + // Add the number of frames to the marker byte marker |= ctx->pending_frame_count - 1; - /* Choose the magnitude */ + // Choose the magnitude for (mag = 0, mask = 0xff; mag < 4; mag++) { if (ctx->pending_frame_magnitude < mask) break; @@ -611,15 +760,29 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { } marker |= mag << 3; - /* Write the index */ + // Write the index index_sz = 2 + (mag + 1) * ctx->pending_frame_count; if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) { uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz; int i, j; +#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA + uint8_t marker_test = 0xc0; + int mag_test = 2; // 1 - 4 + int frames_test = 4; // 1 - 8 + int index_sz_test = 2 + mag_test * frames_test; + marker_test |= frames_test - 1; + marker_test |= (mag_test - 1) << 3; + *x++ = marker_test; + for (i = 0; i < mag_test * frames_test; ++i) + *x++ = 0; // fill up with arbitrary data + *x++ = marker_test; + ctx->pending_cx_data_sz += index_sz_test; + printf("Added supplemental superframe data\n"); +#endif *x++ = marker; for (i = 0; i < ctx->pending_frame_count; i++) { - int this_sz = ctx->pending_frame_sizes[i]; + unsigned int this_sz = (unsigned int)ctx->pending_frame_sizes[i]; for (j = 0; j <= mag; j++) { *x++ = this_sz & 0xff; @@ -628,114 +791,121 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { } *x++ = marker; ctx->pending_cx_data_sz += index_sz; +#ifdef TEST_SUPPLEMENTAL_SUPERFRAME_DATA + index_sz += index_sz_test; +#endif } return index_sz; } -static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, - const vpx_image_t *img, - vpx_codec_pts_t pts, - unsigned long duration, - vpx_enc_frame_flags_t flags, - unsigned long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; +// vp9 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000LL - if (img) +static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase, + int64_t n) { + return n * TICKS_PER_SEC * timebase->num / timebase->den; +} + +static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase, + int64_t n) { + const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; + return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; +} + +static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, + unsigned int lib_flags) { + vpx_codec_frame_flags_t flags = lib_flags << 16; + + if (lib_flags & FRAMEFLAGS_KEY +#if CONFIG_SPATIAL_SVC + || (is_two_pass_svc(cpi) && cpi->svc.layer_context[0].is_key_frame) +#endif + ) + flags |= VPX_FRAME_IS_KEY; + + if (cpi->droppable) + flags |= VPX_FRAME_IS_DROPPABLE; + + return flags; +} + +static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, + const vpx_image_t *img, + vpx_codec_pts_t pts, + unsigned long duration, + vpx_enc_frame_flags_t flags, + unsigned long deadline) { + vpx_codec_err_t res = VPX_CODEC_OK; + VP9_COMP *const cpi = ctx->cpi; + const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; + + 
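// Worked example for the tick conversions above, assuming g_timebase = 1/30:
//   timebase_units_to_ticks(tb, 1)  = 1 * 10000000 * 1 / 30 = 333333 ticks
//   ticks_to_timebase_units(tb, 333333)
//     = (333333 * 30 + (10000000 * 1 / 2 - 1)) / 1 / 10000000
//     = (9999990 + 4999999) / 10000000 = 1
// so a pts survives the round trip despite the integer divisions.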
if (img != NULL) { res = validate_img(ctx, img); + // TODO(jzern) the checks related to cpi's validity should be treated as a + // failure condition, encoder setup is done fully in init() currently. + if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) { + // There's no codec control for multiple alt-refs so check the encoder + // instance for its status to determine the compressed data size. + ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h * + get_image_bps(img) / 8 * + (cpi->multi_arf_allowed ? 8 : 2); + if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096; + + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + return VPX_CODEC_MEM_ERROR; + } + } + } pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); - /* Handle Flags */ - if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF)) - || ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) { + // Handle Flags + if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF)) || + ((flags & VP8_EFLAG_NO_UPD_ARF) && (flags & VP8_EFLAG_FORCE_ARF))) { ctx->base.err_detail = "Conflicting flags."; return VPX_CODEC_INVALID_PARAM; } - if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF - | VP8_EFLAG_NO_REF_ARF)) { - int ref = 7; + vp9_apply_encoding_flags(cpi, flags); - if (flags & VP8_EFLAG_NO_REF_LAST) - ref ^= VP9_LAST_FLAG; - - if (flags & VP8_EFLAG_NO_REF_GF) - ref ^= VP9_GOLD_FLAG; - - if (flags & VP8_EFLAG_NO_REF_ARF) - ref ^= VP9_ALT_FLAG; - - vp9_use_as_reference(ctx->cpi, ref); - } - - if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF - | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF - | VP8_EFLAG_FORCE_ARF)) { - int upd = 7; - - if (flags & VP8_EFLAG_NO_UPD_LAST) - upd ^= VP9_LAST_FLAG; - - if (flags & VP8_EFLAG_NO_UPD_GF) - upd ^= VP9_GOLD_FLAG; - - if (flags & VP8_EFLAG_NO_UPD_ARF) - upd ^= VP9_ALT_FLAG; - - vp9_update_reference(ctx->cpi, upd); - } - - if (flags & VP8_EFLAG_NO_UPD_ENTROPY) { - vp9_update_entropy(ctx->cpi, 0); - } - - /* Handle fixed keyframe intervals */ - if (ctx->cfg.kf_mode == VPX_KF_AUTO - && ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) { + // Handle fixed keyframe intervals + if (ctx->cfg.kf_mode == VPX_KF_AUTO && + ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) { if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) { flags |= VPX_EFLAG_FORCE_KF; ctx->fixed_kf_cntr = 1; } } - /* Initialize the encoder instance on the first frame*/ - if (!res && ctx->cpi) { - unsigned int lib_flags; + // Initialize the encoder instance on the first frame. + if (res == VPX_CODEC_OK && cpi != NULL) { + unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp, dst_end_time_stamp; - unsigned long size, cx_data_sz; + int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); + int64_t dst_end_time_stamp = + timebase_units_to_ticks(timebase, pts + duration); + size_t size, cx_data_sz; unsigned char *cx_data; - /* Set up internal flags */ + // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) - ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1; - - // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION) - // ((VP9_COMP *)ctx->cpi)->output_partition = 1; - - /* Convert API flags to internal codec lib flags */ - lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? 
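// Worked example for the lazy cx_data allocation above, assuming a 640x480
// I420 input (get_image_bps() returns 12) and multi_arf_allowed == 0:
//   cx_data_sz = 640 * 480 * 12 / 8 * 2 = 921600 bytes
// with the 4096-byte floor only mattering for very small frames.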
FRAMEFLAGS_KEY : 0; - - /* vp8 use 10,000,000 ticks/second as time stamp */ - dst_time_stamp = pts * 10000000 * ctx->cfg.g_timebase.num - / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; + cpi->b_calculate_psnr = 1; if (img != NULL) { res = image2yuvconfig(img, &sd); - if (vp9_receive_raw_frame(ctx->cpi, lib_flags, + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame. + if (vp9_receive_raw_frame(cpi, flags, &sd, dst_time_stamp, dst_end_time_stamp)) { - VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; res = update_error_state(ctx, &cpi->common.error); } } cx_data = ctx->cx_data; cx_data_sz = ctx->cx_data_sz; - lib_flags = 0; /* Any pending invisible frames? */ if (ctx->pending_cx_data) { @@ -754,17 +924,25 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, } while (cx_data_sz >= ctx->cx_data_sz / 2 && - -1 != vp9_get_compressed_data(ctx->cpi, &lib_flags, &size, + -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { if (size) { - vpx_codec_pts_t round, delta; vpx_codec_cx_pkt_t pkt; - VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; - /* Pack invisible frames with the next visible frame */ - if (!cpi->common.show_frame) { - if (!ctx->pending_cx_data) +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi)) + cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size; +#endif + + // Pack invisible frames with the next visible frame + if (!cpi->common.show_frame +#if CONFIG_SPATIAL_SVC + || (is_two_pass_svc(cpi) && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) +#endif + ) { + if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; @@ -774,79 +952,46 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, continue; } - /* Add the frame packet to the list of returned packets. */ - round = (vpx_codec_pts_t)1000000 * ctx->cfg.g_timebase.num / 2 - 1; - delta = (dst_end_time_stamp - dst_time_stamp); + // Add the frame packet to the list of returned packets. pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000; - pkt.data.frame.duration = (unsigned long) - ((delta * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000); - pkt.data.frame.flags = lib_flags << 16; + pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); + pkt.data.frame.duration = + (unsigned long)ticks_to_timebase_units(timebase, + dst_end_time_stamp - dst_time_stamp); + pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); - if (lib_flags & FRAMEFLAGS_KEY) - pkt.data.frame.flags |= VPX_FRAME_IS_KEY; - - if (!cpi->common.show_frame) { - pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE; - - // This timestamp should be as close as possible to the - // prior PTS so that if a decoder uses pts to schedule when - // to do this, we start right after last frame was decoded. - // Invisible frames have no duration. 
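// Superframe packing sketch for the invisible-frame path above: each frame
// that is not shown is appended to pending_cx_data, and
// write_superframe_index() later emits a trailing index. Worked example,
// assuming two pending frames of 427 and 137 bytes:
//   pending_frame_magnitude = 427 | 137 = 0x1AB  -> mag = 1 (2-byte sizes)
//   marker   = 0xc0 | (2 - 1) | (1 << 3) = 0xc9
//   index    = 0xc9, 0xAB, 0x01, 0x89, 0x00, 0xc9  (sizes little-endian)
//   index_sz = 2 + (1 + 1) * 2 = 6 bytes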
- pkt.data.frame.pts = ((cpi->last_time_stamp_seen - * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000) + 1; - pkt.data.frame.duration = 0; + if (ctx->pending_cx_data) { + ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + ctx->pending_frame_magnitude |= size; + ctx->pending_cx_data_sz += size; + size += write_superframe_index(ctx); + pkt.data.frame.buf = ctx->pending_cx_data; + pkt.data.frame.sz = ctx->pending_cx_data_sz; + ctx->pending_cx_data = NULL; + ctx->pending_cx_data_sz = 0; + ctx->pending_frame_count = 0; + ctx->pending_frame_magnitude = 0; + } else { + pkt.data.frame.buf = cx_data; + pkt.data.frame.sz = size; } - - if (cpi->droppable) - pkt.data.frame.flags |= VPX_FRAME_IS_DROPPABLE; - - /*if (cpi->output_partition) - { - int i; - const int num_partitions = 1; - - pkt.data.frame.flags |= VPX_FRAME_IS_FRAGMENT; - - for (i = 0; i < num_partitions; ++i) - { - pkt.data.frame.buf = cx_data; - pkt.data.frame.sz = cpi->partition_sz[i]; - pkt.data.frame.partition_id = i; - // don't set the fragment bit for the last partition - if (i == (num_partitions - 1)) - pkt.data.frame.flags &= ~VPX_FRAME_IS_FRAGMENT; - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); - cx_data += cpi->partition_sz[i]; - cx_data_sz -= cpi->partition_sz[i]; - } - } - else*/ - { - if (ctx->pending_cx_data) { - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; - ctx->pending_frame_magnitude |= size; - ctx->pending_cx_data_sz += size; - size += write_superframe_index(ctx); - pkt.data.frame.buf = ctx->pending_cx_data; - pkt.data.frame.sz = ctx->pending_cx_data_sz; - ctx->pending_cx_data = NULL; - ctx->pending_cx_data_sz = 0; - ctx->pending_frame_count = 0; - ctx->pending_frame_magnitude = 0; - } else { - pkt.data.frame.buf = cx_data; - pkt.data.frame.sz = size; + pkt.data.frame.partition_id = -1; + vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); + cx_data += size; + cx_data_sz -= size; +#if CONFIG_SPATIAL_SVC + if (is_two_pass_svc(cpi)) { + vpx_codec_cx_pkt_t pkt; + int i; + vp9_zero(pkt); + pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size; + cpi->svc.layer_context[i].layer_size = 0; } - pkt.data.frame.partition_id = -1; vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); - cx_data += size; - cx_data_sz -= size; } +#endif } } } @@ -854,19 +999,16 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, return res; } - -static const vpx_codec_cx_pkt_t *vp9e_get_cxdata(vpx_codec_alg_priv_t *ctx, - vpx_codec_iter_t *iter) { +static const vpx_codec_cx_pkt_t *encoder_get_cxdata(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) { return vpx_codec_pkt_list_get(&ctx->pkt_list.head, iter); } -static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); - if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); @@ -878,13 +1020,11 @@ static vpx_codec_err_t vp9e_set_reference(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); + 
vpx_ref_frame_t *const frame = va_arg(args, vpx_ref_frame_t *); - if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + if (frame != NULL) { YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); @@ -896,55 +1036,51 @@ static vpx_codec_err_t vp9e_copy_reference(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { - vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); +static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); - if (data) { - YV12_BUFFER_CONFIG* fb; + if (frame != NULL) { + YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + if (fb == NULL) return VPX_CODEC_ERROR; - vp9_get_reference_enc(ctx->cpi, data->idx, &fb); - yuvconfig2image(&data->img, fb, NULL); + yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; } } -static vpx_codec_err_t vp9e_set_previewpp(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, va_list args) { #if CONFIG_VP9_POSTPROC - vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); - (void)ctr_id; - - if (data) { - ctx->preview_ppcfg = *((vp8_postproc_cfg_t *)data); + vp8_postproc_cfg_t *config = va_arg(args, vp8_postproc_cfg_t *); + if (config != NULL) { + ctx->preview_ppcfg = *config; return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; } #else (void)ctx; - (void)ctr_id; (void)args; return VPX_CODEC_INCAPABLE; #endif } -static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) { +static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = {0}; + vp9_ppflags_t flags; + vp9_zero(flags); if (ctx->preview_ppcfg.post_proc_flag) { - flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag; - flags.deblocking_level = ctx->preview_ppcfg.deblocking_level; - flags.noise_level = ctx->preview_ppcfg.noise_level; + flags.post_proc_flag = ctx->preview_ppcfg.post_proc_flag; + flags.deblocking_level = ctx->preview_ppcfg.deblocking_level; + flags.noise_level = ctx->preview_ppcfg.noise_level; } - if (0 == vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags)) { + if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; } else { @@ -952,252 +1088,270 @@ static vpx_image_t *vp9e_get_preview(vpx_codec_alg_priv_t *ctx) { } } -static vpx_codec_err_t vp9e_update_entropy(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_update_entropy(vpx_codec_alg_priv_t *ctx, va_list args) { - int update = va_arg(args, int); + const int update = va_arg(args, int); + vp9_update_entropy(ctx->cpi, update); return VPX_CODEC_OK; } -static vpx_codec_err_t vp9e_update_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_update_reference(vpx_codec_alg_priv_t *ctx, va_list args) { - int update = va_arg(args, int); - vp9_update_reference(ctx->cpi, update); + const int ref_frame_flags = va_arg(args, int); + + vp9_update_reference(ctx->cpi, ref_frame_flags); return VPX_CODEC_OK; } -static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_use_reference(vpx_codec_alg_priv_t *ctx, va_list args) { - int reference_flag = va_arg(args, int); + const int reference_flag = va_arg(args, int); + vp9_use_as_reference(ctx->cpi, reference_flag); return 
VPX_CODEC_OK; } -static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx, - int ctr_id, +static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { + (void)ctx; + (void)args; + // TODO(yaowu): Need to re-implement and test for VP9. return VPX_CODEC_INVALID_PARAM; } -static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { - // TODO(yaowu): Need to re-implement and test for VP9. - return VPX_CODEC_INVALID_PARAM; -} +static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *); -static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { - vpx_scaling_mode_t *data = va_arg(args, vpx_scaling_mode_t *); - - if (data) { - int res; - vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp9_set_internal_size(ctx->cpi, - (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); - - if (!res) { + if (map) { + if (!vp9_set_active_map(ctx->cpi, map->active_map, + (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - } else { + else return VPX_CODEC_INVALID_PARAM; - } } else { return VPX_CODEC_INVALID_PARAM; } } -static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id, - va_list args) { +static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); + + if (mode) { + const int res = vp9_set_internal_size(ctx->cpi, + (VPX_SCALING)mode->h_scaling_mode, + (VPX_SCALING)mode->v_scaling_mode); + return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { int data = va_arg(args, int); + const vpx_codec_enc_cfg_t *cfg = &ctx->cfg; + vp9_set_svc(ctx->cpi, data); + // CBR or two pass mode for SVC with both temporal and spatial layers + // not yet supported. + if (data == 1 && + (cfg->rc_end_usage == VPX_CBR || + cfg->g_pass == VPX_RC_FIRST_PASS || + cfg->g_pass == VPX_RC_LAST_PASS) && + cfg->ss_number_layers > 1 && + cfg->ts_number_layers > 1) { + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_OK; } -static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx, - int ctr_id, va_list args) { - vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *); - VP9_COMP *cpi = (VP9_COMP *)ctx->cpi; - vpx_svc_parameters_t params; +static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); + VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; + SVC *const svc = &cpi->svc; - if (data == NULL) { + svc->spatial_layer_id = data->spatial_layer_id; + svc->temporal_layer_id = data->temporal_layer_id; + // Checks on valid layer_id input. 
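// Application-side sketch for the active-map control above, assuming an
// initialized encoder context `codec`; the dimensions are illustrative and
// must match what vp9_set_active_map() expects, otherwise the control
// returns VPX_CODEC_INVALID_PARAM:
//   unsigned char map[2 * 2] = { 1, 0, 0, 1 };  // 1 = encode, 0 = skip
//   vpx_active_map_t am = { map, 2, 2 };        // active_map, rows, cols
//   vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &am);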
+ if (svc->temporal_layer_id < 0 || + svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { return VPX_CODEC_INVALID_PARAM; } - - params = *(vpx_svc_parameters_t *)data; - - cpi->current_layer = params.layer; - cpi->lst_fb_idx = params.lst_fb_idx; - cpi->gld_fb_idx = params.gld_fb_idx; - cpi->alt_fb_idx = params.alt_fb_idx; - - if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) { + if (svc->spatial_layer_id < 0 || + svc->spatial_layer_id >= (int)ctx->cfg.ss_number_layers) { return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_OK; +} - ctx->cfg.rc_max_quantizer = params.max_quantizer; - ctx->cfg.rc_min_quantizer = params.min_quantizer; +static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_parameters_t *const params = va_arg(args, vpx_svc_parameters_t *); - set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); - vp9_change_config(ctx->cpi, &ctx->oxcf); + if (params == NULL || params->spatial_layer < 0 || + params->spatial_layer >= cpi->svc.number_spatial_layers) + return VPX_CODEC_INVALID_PARAM; + + if (params->spatial_layer == 0) { + int i; + for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { + cpi->svc.layer_context[i].svc_params_received.spatial_layer = -1; + } + } + + cpi->svc.layer_context[params->spatial_layer].svc_params_received = + *params; return VPX_CODEC_OK; } -static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = { - {VP8_SET_REFERENCE, vp9e_set_reference}, - {VP8_COPY_REFERENCE, vp9e_copy_reference}, - {VP8_SET_POSTPROC, vp9e_set_previewpp}, - {VP8E_UPD_ENTROPY, vp9e_update_entropy}, - {VP8E_UPD_REFERENCE, vp9e_update_reference}, - {VP8E_USE_REFERENCE, vp9e_use_reference}, - {VP8E_SET_ROI_MAP, vp9e_set_roi_map}, - {VP8E_SET_ACTIVEMAP, vp9e_set_activemap}, - {VP8E_SET_SCALEMODE, vp9e_set_scalemode}, - {VP8E_SET_CPUUSED, set_param}, - {VP8E_SET_NOISE_SENSITIVITY, set_param}, - {VP8E_SET_ENABLEAUTOALTREF, set_param}, - {VP8E_SET_SHARPNESS, set_param}, - {VP8E_SET_STATIC_THRESHOLD, set_param}, - {VP9E_SET_TILE_COLUMNS, set_param}, - {VP9E_SET_TILE_ROWS, set_param}, - {VP8E_GET_LAST_QUANTIZER, get_param}, - {VP8E_GET_LAST_QUANTIZER_64, get_param}, - {VP8E_SET_ARNR_MAXFRAMES, set_param}, - {VP8E_SET_ARNR_STRENGTH, set_param}, - {VP8E_SET_ARNR_TYPE, set_param}, - {VP8E_SET_TUNING, set_param}, - {VP8E_SET_CQ_LEVEL, set_param}, - {VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param}, - {VP9E_SET_LOSSLESS, set_param}, - {VP9E_SET_FRAME_PARALLEL_DECODING, set_param}, - {VP9E_SET_AQ_MODE, set_param}, - {VP9_GET_REFERENCE, get_reference}, - {VP9E_SET_SVC, vp9e_set_svc}, - {VP9E_SET_SVC_PARAMETERS, vp9e_set_svc_parameters}, +static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.content = CAST(VP9E_SET_TUNE_CONTENT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { + {VP8_COPY_REFERENCE, ctrl_copy_reference}, + {VP8E_UPD_ENTROPY, ctrl_update_entropy}, + {VP8E_UPD_REFERENCE, ctrl_update_reference}, + {VP8E_USE_REFERENCE, ctrl_use_reference}, + + // Setters + {VP8_SET_REFERENCE, ctrl_set_reference}, + {VP8_SET_POSTPROC, ctrl_set_previewpp}, + {VP8E_SET_ROI_MAP, ctrl_set_roi_map}, + {VP8E_SET_ACTIVEMAP, ctrl_set_active_map}, + {VP8E_SET_SCALEMODE, ctrl_set_scale_mode}, + {VP8E_SET_CPUUSED, ctrl_set_cpuused}, + {VP8E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity}, + {VP8E_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref}, + 
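// Dispatch note: vpx_codec_control() scans this table for the control id and
// invokes the matching handler with the caller's varargs, so an application
// call such as (assuming an initialized encoder context `codec`):
//   vpx_codec_control(&codec, VP8E_SET_CPUUSED, 2);
// ends up in ctrl_set_cpuused() via the entry above.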
{VP8E_SET_SHARPNESS, ctrl_set_sharpness}, + {VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh}, + {VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns}, + {VP9E_SET_TILE_ROWS, ctrl_set_tile_rows}, + {VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames}, + {VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength}, + {VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type}, + {VP8E_SET_TUNING, ctrl_set_tuning}, + {VP8E_SET_CQ_LEVEL, ctrl_set_cq_level}, + {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct}, + {VP9E_SET_LOSSLESS, ctrl_set_lossless}, + {VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode}, + {VP9E_SET_AQ_MODE, ctrl_set_aq_mode}, + {VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost}, + {VP9E_SET_SVC, ctrl_set_svc}, + {VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters}, + {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, + {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, + + // Getters + {VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer}, + {VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64}, + {VP9_GET_REFERENCE, ctrl_get_reference}, + { -1, NULL}, }; -static vpx_codec_enc_cfg_map_t vp9e_usage_cfg_map[] = { +static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, { // NOLINT - 0, /* g_usage */ - 0, /* g_threads */ - 0, /* g_profile */ + 0, // g_usage + 0, // g_threads + 0, // g_profile - 320, /* g_width */ - 240, /* g_height */ - {1, 30}, /* g_timebase */ + 320, // g_width + 240, // g_height + VPX_BITS_8, // g_bit_depth + 8, // g_input_bit_depth - 0, /* g_error_resilient */ + {1, 30}, // g_timebase - VPX_RC_ONE_PASS, /* g_pass */ + 0, // g_error_resilient - 25, /* g_lag_in_frames */ + VPX_RC_ONE_PASS, // g_pass - 0, /* rc_dropframe_thresh */ - 0, /* rc_resize_allowed */ - 60, /* rc_resize_down_thresold */ - 30, /* rc_resize_up_thresold */ + 25, // g_lag_in_frames - VPX_VBR, /* rc_end_usage */ + 0, // rc_dropframe_thresh + 0, // rc_resize_allowed + 1, // rc_scaled_width + 1, // rc_scaled_height + 60, // rc_resize_down_thresold + 30, // rc_resize_up_thresold + + VPX_VBR, // rc_end_usage #if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION) - {0}, /* rc_twopass_stats_in */ + {NULL, 0}, // rc_twopass_stats_in + {NULL, 0}, // rc_firstpass_mb_stats_in #endif - 256, /* rc_target_bandwidth */ - 0, /* rc_min_quantizer */ - 63, /* rc_max_quantizer */ - 100, /* rc_undershoot_pct */ - 100, /* rc_overshoot_pct */ + 256, // rc_target_bandwidth + 0, // rc_min_quantizer + 63, // rc_max_quantizer + 100, // rc_undershoot_pct + 100, // rc_overshoot_pct - 6000, /* rc_max_buffer_size */ - 4000, /* rc_buffer_initial_size; */ - 5000, /* rc_buffer_optimal_size; */ + 6000, // rc_max_buffer_size + 4000, // rc_buffer_initial_size + 5000, // rc_buffer_optimal_size - 50, /* rc_two_pass_vbrbias */ - 0, /* rc_two_pass_vbrmin_section */ - 2000, /* rc_two_pass_vbrmax_section */ + 50, // rc_two_pass_vbrbias + 0, // rc_two_pass_vbrmin_section + 2000, // rc_two_pass_vbrmax_section - /* keyframing settings (kf) */ - VPX_KF_AUTO, /* g_kfmode*/ - 0, /* kf_min_dist */ - 9999, /* kf_max_dist */ - - VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ + // keyframing settings (kf) + VPX_KF_AUTO, // g_kfmode + 0, // kf_min_dist + 9999, // kf_max_dist + VPX_SS_DEFAULT_LAYERS, // ss_number_layers + {0}, + {0}, // ss_target_bitrate + 1, // ts_number_layers + {0}, // ts_target_bitrate + {0}, // ts_rate_decimator + 0, // ts_periodicity + {0}, // ts_layer_id #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) - 1, /* g_delete_first_pass_file */ - "vp8.fpf" /* first pass filename */ + "vp8.fpf" // first pass 
filename #endif } }, - { -1, {NOT_IMPLEMENTED}} }; - #ifndef VERSION_STRING #define VERSION_STRING #endif CODEC_INTERFACE(vpx_codec_vp9_cx) = { "WebM Project VP9 Encoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR | - VPX_CODEC_CAP_OUTPUT_PARTITION, - /* vpx_codec_caps_t caps; */ - vp9e_init, /* vpx_codec_init_fn_t init; */ - vp9e_destroy, /* vpx_codec_destroy_fn_t destroy; */ - vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ + VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR, // vpx_codec_caps_t + encoder_init, // vpx_codec_init_fn_t + encoder_destroy, // vpx_codec_destroy_fn_t + encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t { // NOLINT - NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */ - NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */ - NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */ - NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ + NULL, // vpx_codec_peek_si_fn_t + NULL, // vpx_codec_get_si_fn_t + NULL, // vpx_codec_decode_fn_t + NULL, // vpx_codec_frame_get_fn_t + NULL // vpx_codec_set_fb_fn_t }, { // NOLINT - vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */ - vp9e_encode, /* vpx_codec_encode_fn_t encode; */ - vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */ - vp9e_set_config, - NOT_IMPLEMENTED, - vp9e_get_preview, - } /* encoder functions */ + 1, // 1 cfg map + encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t + encoder_encode, // vpx_codec_encode_fn_t + encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t + encoder_set_config, // vpx_codec_enc_config_set_fn_t + NULL, // vpx_codec_get_global_headers_fn_t + encoder_get_preview, // vpx_codec_get_preview_frame_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t + } }; - - -#if CONFIG_EXPERIMENTAL - -CODEC_INTERFACE(vpx_codec_vp9x_cx) = { - "VP8 Experimental Encoder" VERSION_STRING, - VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR, - /* vpx_codec_caps_t caps; */ - vp9e_exp_init, /* vpx_codec_init_fn_t init; */ - vp9e_destroy, /* vpx_codec_destroy_fn_t destroy; */ - vp9e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ - { // NOLINT - NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */ - NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */ - NOT_IMPLEMENTED, /* vpx_codec_decode_fn_t decode; */ - NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ - }, - { // NOLINT - vp9e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */ - vp9e_encode, /* vpx_codec_encode_fn_t encode; */ - vp9e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */ - vp9e_set_config, - NOT_IMPLEMENTED, - vp9e_get_preview, - } /* encoder functions */ -}; -#endif diff --git a/media/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/vp9/vp9_dx_iface.c index 5dacab45460d..393c66ebdad2 100644 --- a/media/libvpx/vp9/vp9_dx_iface.c +++ b/media/libvpx/vp9/vp9_dx_iface.c @@ -8,465 +8,436 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include #include -#include "vpx/vpx_decoder.h" -#include "vpx/vp8dx.h" -#include "vpx/internal/vpx_codec_internal.h" + #include "./vpx_version.h" -#include "vp9/decoder/vp9_onyxd.h" -#include "vp9/decoder/vp9_onyxd_int.h" + +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +#include "vp9/common/vp9_frame_buffers.h" + +#include "vp9/decoder/vp9_decoder.h" +#include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_read_bit_buffer.h" + #include "vp9/vp9_iface_common.h" #define VP9_CAP_POSTPROC (CONFIG_VP9_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0) -typedef vpx_codec_stream_info_t vp9_stream_info_t; -/* Structures for handling memory allocations */ -typedef enum { - VP9_SEG_ALG_PRIV = 256, - VP9_SEG_MAX -} mem_seg_id_t; -#define NELEMENTS(x) ((int)(sizeof(x)/sizeof(x[0]))) - -static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, - vpx_codec_flags_t flags); - -static const mem_req_t vp9_mem_req_segs[] = { - {VP9_SEG_ALG_PRIV, 0, 8, VPX_CODEC_MEM_ZERO, priv_sz}, - {VP9_SEG_MAX, 0, 0, 0, NULL} -}; +typedef vpx_codec_stream_info_t vp9_stream_info_t; struct vpx_codec_alg_priv { vpx_codec_priv_t base; - vpx_codec_mmap_t mmaps[NELEMENTS(vp9_mem_req_segs) - 1]; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; - int defer_alloc; - int decoder_init; - VP9D_PTR pbi; + struct VP9Decoder *pbi; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; -#if CONFIG_POSTPROC_VISUALIZER - unsigned int dbg_postproc_flag; - int dbg_color_ref_frame_flag; - int dbg_color_mb_modes_flag; - int dbg_color_b_modes_flag; - int dbg_display_mv_flag; -#endif + vpx_decrypt_cb decrypt_cb; + void *decrypt_state; vpx_image_t img; - int img_setup; int img_avail; + int flushed; int invert_tile_order; + int frame_parallel_decode; // frame-based threading. + + // External frame buffer info to save for VP9 common. + void *ext_priv; // Private data associated with the external frame buffers. + vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; + vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; }; -static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si, - vpx_codec_flags_t flags) { - /* Although this declaration is constant, we can't use it in the requested - * segments list because we want to define the requested segments list - * before defining the private type (so that the number of memory maps is - * known) - */ - (void)si; - return sizeof(vpx_codec_alg_priv_t); -} +static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { + // This function only allocates space for the vpx_codec_alg_priv_t + // structure. More memory may be required at the time the stream + // information becomes known. + (void)data; -static void vp9_init_ctx(vpx_codec_ctx_t *ctx, const vpx_codec_mmap_t *mmap) { - int i; - - ctx->priv = mmap->base; - ctx->priv->sz = sizeof(*ctx->priv); - ctx->priv->iface = ctx->iface; - ctx->priv->alg_priv = mmap->base; - - for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) - ctx->priv->alg_priv->mmaps[i].id = vp9_mem_req_segs[i].id; - - ctx->priv->alg_priv->mmaps[0] = *mmap; - ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); - ctx->priv->init_flags = ctx->init_flags; - - if (ctx->config.dec) { - /* Update the reference to the config structure to an internal copy. 
*/ - ctx->priv->alg_priv->cfg = *ctx->config.dec; - ctx->config.dec = &ctx->priv->alg_priv->cfg; - } -} - -static void vp9_finalize_mmaps(vpx_codec_alg_priv_t *ctx) { - /* nothing to clean up */ -} - -static vpx_codec_err_t vp9_init(vpx_codec_ctx_t *ctx, - vpx_codec_priv_enc_mr_cfg_t *data) { - vpx_codec_err_t res = VPX_CODEC_OK; - - /* This function only allocates space for the vpx_codec_alg_priv_t - * structure. More memory may be required at the time the stream - * information becomes known. - */ if (!ctx->priv) { - vpx_codec_mmap_t mmap; + vpx_codec_alg_priv_t *const priv = vpx_calloc(1, sizeof(*priv)); + if (priv == NULL) + return VPX_CODEC_MEM_ERROR; - mmap.id = vp9_mem_req_segs[0].id; - mmap.sz = sizeof(vpx_codec_alg_priv_t); - mmap.align = vp9_mem_req_segs[0].align; - mmap.flags = vp9_mem_req_segs[0].flags; + ctx->priv = (vpx_codec_priv_t *)priv; + ctx->priv->init_flags = ctx->init_flags; - res = vpx_mmap_alloc(&mmap); + priv->si.sz = sizeof(priv->si); + priv->flushed = 0; + priv->frame_parallel_decode = + (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING); + priv->frame_parallel_decode = 0; // Disable for now - if (!res) { - vp9_init_ctx(ctx, &mmap); - - ctx->priv->alg_priv->defer_alloc = 1; - /*post processing level initialized to do nothing */ + if (ctx->config.dec) { + priv->cfg = *ctx->config.dec; + ctx->config.dec = &priv->cfg; } } - return res; -} - -static vpx_codec_err_t vp9_destroy(vpx_codec_alg_priv_t *ctx) { - int i; - - vp9_remove_decompressor(ctx->pbi); - - for (i = NELEMENTS(ctx->mmaps) - 1; i >= 0; i--) { - if (ctx->mmaps[i].dtor) - ctx->mmaps[i].dtor(&ctx->mmaps[i]); - } - return VPX_CODEC_OK; } -static vpx_codec_err_t vp9_peek_si(const uint8_t *data, - unsigned int data_sz, - vpx_codec_stream_info_t *si) { - if (data_sz <= 8) return VPX_CODEC_UNSUP_BITSTREAM; - if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; +static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { + if (ctx->pbi) { + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + + vpx_free(ctx); + + return VPX_CODEC_OK; +} + +static int parse_bitdepth_colorspace_sampling( + BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) { + const int sRGB = 7; + int colorspace; + if (profile >= PROFILE_2) + rb->bit_offset += 1; // Bit-depth 10 or 12. + colorspace = vp9_rb_read_literal(rb, 3); + if (colorspace != sRGB) { + rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range. + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 2; // subsampling x/y. + rb->bit_offset += 1; // unused. + } + } else { + if (profile == PROFILE_1 || profile == PROFILE_3) { + rb->bit_offset += 1; // unused + } else { + // RGB is only available in version 1. 
+ return 0; + } + } + return 1; +} + +static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si, + int *is_intra_only, + vpx_decrypt_cb decrypt_cb, + void *decrypt_state) { + int intra_only_flag = 0; + uint8_t clear_buffer[9]; + + if (data + data_sz <= data) + return VPX_CODEC_INVALID_PARAM; si->is_kf = 0; si->w = si->h = 0; + if (decrypt_cb) { + data_sz = MIN(sizeof(clear_buffer), data_sz); + decrypt_cb(decrypt_state, data, clear_buffer, data_sz); + data = clear_buffer; + } + { + int show_frame; + int error_resilient; struct vp9_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; const int frame_marker = vp9_rb_read_literal(&rb, 2); - const int version = vp9_rb_read_bit(&rb) | (vp9_rb_read_bit(&rb) << 1); - if (frame_marker != 0x2) return VPX_CODEC_UNSUP_BITSTREAM; -#if CONFIG_NON420 - if (version > 1) return VPX_CODEC_UNSUP_BITSTREAM; -#else - if (version != 0) return VPX_CODEC_UNSUP_BITSTREAM; -#endif + const BITSTREAM_PROFILE profile = vp9_read_profile(&rb); + + if (frame_marker != VP9_FRAME_MARKER) + return VPX_CODEC_UNSUP_BITSTREAM; + + if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM; if (vp9_rb_read_bit(&rb)) { // show an existing frame + vp9_rb_read_literal(&rb, 3); // Frame buffer to show. return VPX_CODEC_OK; } + if (data_sz <= 8) + return VPX_CODEC_UNSUP_BITSTREAM; + si->is_kf = !vp9_rb_read_bit(&rb); + show_frame = vp9_rb_read_bit(&rb); + error_resilient = vp9_rb_read_bit(&rb); + if (si->is_kf) { - const int sRGB = 7; - int colorspace; - - rb.bit_offset += 1; // show frame - rb.bit_offset += 1; // error resilient - - if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 || - vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 || - vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) { + if (!vp9_read_sync_code(&rb)) return VPX_CODEC_UNSUP_BITSTREAM; - } - colorspace = vp9_rb_read_literal(&rb, 3); - if (colorspace != sRGB) { - rb.bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range - if (version == 1) { - rb.bit_offset += 2; // subsampling x/y - rb.bit_offset += 1; // has extra plane - } - } else { - if (version == 1) { - rb.bit_offset += 1; // has extra plane - } else { - // RGB is only available in version 1 + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); + } else { + intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb); + + rb.bit_offset += error_resilient ? 0 : 2; // reset_frame_context + + if (intra_only_flag) { + if (!vp9_read_sync_code(&rb)) return VPX_CODEC_UNSUP_BITSTREAM; + if (profile > PROFILE_0) { + if (!parse_bitdepth_colorspace_sampling(profile, &rb)) + return VPX_CODEC_UNSUP_BITSTREAM; } + rb.bit_offset += REF_FRAMES; // refresh_frame_flags + vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); } - - // TODO(jzern): these are available on non-keyframes in intra only mode. 
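// Bitstream prefix parsed by decoder_peek_si_internal() above, as a sketch
// of the uncompressed-header fields this code reads in order:
//   2 bits   frame_marker (must equal VP9_FRAME_MARKER, 2)
//   1-3 bits profile (vp9_read_profile)
//   1 bit    show_existing_frame (+3 bits frame-buffer index when set)
//   1 bit    frame_type (0 = key frame, inverted into si->is_kf)
//   1 bit    show_frame
//   1 bit    error_resilient_mode
// followed by the sync code (0x49 0x83 0x42) and, on key frames, the
// bit-depth/colorspace/subsampling bits and the 16-bit-minus-one frame size.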
- si->w = vp9_rb_read_literal(&rb, 16) + 1; - si->h = vp9_rb_read_literal(&rb, 16) + 1; } } - + if (is_intra_only != NULL) + *is_intra_only = intra_only_flag; return VPX_CODEC_OK; } -static vpx_codec_err_t vp9_get_si(vpx_codec_alg_priv_t *ctx, - vpx_codec_stream_info_t *si) { - unsigned int sz; - - if (si->sz >= sizeof(vp9_stream_info_t)) - sz = sizeof(vp9_stream_info_t); - else - sz = sizeof(vpx_codec_stream_info_t); +static vpx_codec_err_t decoder_peek_si(const uint8_t *data, + unsigned int data_sz, + vpx_codec_stream_info_t *si) { + return decoder_peek_si_internal(data, data_sz, si, NULL, NULL, NULL); +} +static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx, + vpx_codec_stream_info_t *si) { + const size_t sz = (si->sz >= sizeof(vp9_stream_info_t)) + ? sizeof(vp9_stream_info_t) + : sizeof(vpx_codec_stream_info_t); memcpy(si, &ctx->si, sz); - si->sz = sz; + si->sz = (unsigned int)sz; return VPX_CODEC_OK; } +static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx, + const struct vpx_internal_error_info *error) { + if (error->error_code) + ctx->base.err_detail = error->has_detail ? error->detail : NULL; -static vpx_codec_err_t -update_error_state(vpx_codec_alg_priv_t *ctx, - const struct vpx_internal_error_info *error) { - vpx_codec_err_t res; - - if ((res = error->error_code)) - ctx->base.err_detail = error->has_detail - ? error->detail - : NULL; - - return res; + return error->error_code; } -static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, - const uint8_t **data, - unsigned int data_sz, - void *user_priv, - long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; +static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { + VP9_COMMON *const cm = &ctx->pbi->common; + cm->new_fb_idx = -1; + + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + cm->get_fb_cb = ctx->get_ext_fb_cb; + cm->release_fb_cb = ctx->release_ext_fb_cb; + cm->cb_priv = ctx->ext_priv; + } else { + cm->get_fb_cb = vp9_get_frame_buffer; + cm->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + + cm->cb_priv = &cm->int_frame_buffers; + } +} + +static void set_default_ppflags(vp8_postproc_cfg_t *cfg) { + cfg->post_proc_flag = VP8_DEBLOCK | VP8_DEMACROBLOCK; + cfg->deblocking_level = 4; + cfg->noise_level = 0; +} + +static void set_ppflags(const vpx_codec_alg_priv_t *ctx, + vp9_ppflags_t *flags) { + flags->post_proc_flag = + ctx->postproc_cfg.post_proc_flag; + + flags->deblocking_level = ctx->postproc_cfg.deblocking_level; + flags->noise_level = ctx->postproc_cfg.noise_level; +} + +static void init_decoder(vpx_codec_alg_priv_t *ctx) { + ctx->pbi = vp9_decoder_create(); + if (ctx->pbi == NULL) + return; + + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; + ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode; + + // If postprocessing was enabled by the application and a + // configuration has not been provided, default it. 
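// Application-side sketch of the external frame buffer hookup that
// init_buffer_callbacks() above consumes, assuming the
// vpx_codec_set_frame_buffer_functions() entry point from vpx/vpx_decoder.h
// (which routes to decoder_set_fb_fn() later in this file); the malloc-based
// callbacks are illustrative only:
//   static int get_fb(void *priv, size_t min_size,
//                     vpx_codec_frame_buffer_t *fb) {
//     fb->data = (uint8_t *)malloc(min_size);  // real code would pool these
//     fb->size = min_size;
//     return fb->data == NULL ? -1 : 0;        // 0 on success
//   }
//   static int release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
//     free(fb->data);
//     return 0;
//   }
//   vpx_codec_set_frame_buffer_functions(&decoder, get_fb, release_fb, NULL);
// Without registered callbacks, vp9_get_frame_buffer()/
// vp9_release_frame_buffer() serve buffers from cm->int_frame_buffers.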
+ if (!ctx->postproc_cfg_set && + (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) + set_default_ppflags(&ctx->postproc_cfg); + + init_buffer_callbacks(ctx); +} + +static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, + const uint8_t **data, unsigned int data_sz, + void *user_priv, int64_t deadline) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = {0, 0, 0}; + VP9_COMMON *cm = NULL; + + (void)deadline; + + vp9_zero(sd); ctx->img_avail = 0; - /* Determine the stream parameters. Note that we rely on peek_si to - * validate that we have a buffer that does not wrap around the top - * of the heap. - */ - if (!ctx->si.h) - res = ctx->base.iface->dec.peek_si(*data, data_sz, &ctx->si); + // Determine the stream parameters. Note that we rely on peek_si to + // validate that we have a buffer that does not wrap around the top + // of the heap. + if (!ctx->si.h) { + int is_intra_only = 0; + const vpx_codec_err_t res = + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) + return res; - - /* Perform deferred allocations, if required */ - if (!res && ctx->defer_alloc) { - int i; - - for (i = 1; !res && i < NELEMENTS(ctx->mmaps); i++) { - vpx_codec_dec_cfg_t cfg; - - cfg.w = ctx->si.w; - cfg.h = ctx->si.h; - ctx->mmaps[i].id = vp9_mem_req_segs[i].id; - ctx->mmaps[i].sz = vp9_mem_req_segs[i].sz; - ctx->mmaps[i].align = vp9_mem_req_segs[i].align; - ctx->mmaps[i].flags = vp9_mem_req_segs[i].flags; - - if (!ctx->mmaps[i].sz) - ctx->mmaps[i].sz = vp9_mem_req_segs[i].calc_sz(&cfg, - ctx->base.init_flags); - - res = vpx_mmap_alloc(&ctx->mmaps[i]); - } - - if (!res) - vp9_finalize_mmaps(ctx); - - ctx->defer_alloc = 0; + if (!ctx->si.is_kf && !is_intra_only) + return VPX_CODEC_ERROR; } - /* Initialize the decoder instance on the first frame*/ - if (!res && !ctx->decoder_init) { - res = vpx_validate_mmaps(&ctx->si, ctx->mmaps, - vp9_mem_req_segs, NELEMENTS(vp9_mem_req_segs), - ctx->base.init_flags); - - if (!res) { - VP9D_CONFIG oxcf; - VP9D_PTR optr; - - vp9_initialize_dec(); - - oxcf.width = ctx->si.w; - oxcf.height = ctx->si.h; - oxcf.version = 9; - oxcf.postprocess = 0; - oxcf.max_threads = ctx->cfg.threads; - oxcf.inv_tile_order = ctx->invert_tile_order; - optr = vp9_create_decompressor(&oxcf); - - /* If postprocessing was enabled by the application and a - * configuration has not been provided, default it. - */ - if (!ctx->postproc_cfg_set - && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) { - ctx->postproc_cfg.post_proc_flag = - VP8_DEBLOCK | VP8_DEMACROBLOCK; - ctx->postproc_cfg.deblocking_level = 4; - ctx->postproc_cfg.noise_level = 0; - } - - if (!optr) - res = VPX_CODEC_ERROR; - else - ctx->pbi = optr; - } - - ctx->decoder_init = 1; + // Initialize the decoder instance on the first frame + if (ctx->pbi == NULL) { + init_decoder(ctx); + if (ctx->pbi == NULL) + return VPX_CODEC_ERROR; } - if (!res && ctx->pbi) { - YV12_BUFFER_CONFIG sd; - int64_t time_stamp = 0, time_end_stamp = 0; - vp9_ppflags_t flags = {0}; + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; - if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) { - flags.post_proc_flag = -#if CONFIG_POSTPROC_VISUALIZER - ((ctx->dbg_color_ref_frame_flag != 0) ? - VP9D_DEBUG_CLR_FRM_REF_BLKS : 0) - | ((ctx->dbg_color_mb_modes_flag != 0) ? - VP9D_DEBUG_CLR_BLK_MODES : 0) - | ((ctx->dbg_color_b_modes_flag != 0) ? 
- VP9D_DEBUG_CLR_BLK_MODES : 0) - | ((ctx->dbg_display_mv_flag != 0) ? - VP9D_DEBUG_DRAW_MV : 0) - | -#endif - ctx->postproc_cfg.post_proc_flag; + cm = &ctx->pbi->common; - flags.deblocking_level = ctx->postproc_cfg.deblocking_level; - flags.noise_level = ctx->postproc_cfg.noise_level; -#if CONFIG_POSTPROC_VISUALIZER - flags.display_ref_frame_flag = ctx->dbg_color_ref_frame_flag; - flags.display_mb_modes_flag = ctx->dbg_color_mb_modes_flag; - flags.display_b_modes_flag = ctx->dbg_color_b_modes_flag; - flags.display_mv_flag = ctx->dbg_display_mv_flag; -#endif - } + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) + return update_error_state(ctx, &cm->error); - if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) { - VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; - res = update_error_state(ctx, &pbi->common.error); - } + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) + set_ppflags(ctx, &flags); - if (!res && 0 == vp9_get_raw_frame(ctx->pbi, &sd, &time_stamp, - &time_end_stamp, &flags)) { - yuvconfig2image(&ctx->img, &sd, user_priv); - ctx->img_avail = 1; - } - } + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags)) + return update_error_state(ctx, &cm->error); - return res; + yuvconfig2image(&ctx->img, &sd, user_priv); + ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + ctx->img_avail = 1; + + return VPX_CODEC_OK; } -static void parse_superframe_index(const uint8_t *data, - size_t data_sz, - uint32_t sizes[8], - int *count) { - uint8_t marker; - - assert(data_sz); - marker = data[data_sz - 1]; - *count = 0; - - if ((marker & 0xe0) == 0xc0) { - const uint32_t frames = (marker & 0x7) + 1; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const size_t index_sz = 2 + mag * frames; - - if (data_sz >= index_sz && data[data_sz - index_sz] == marker) { - // found a valid superframe index - uint32_t i, j; - const uint8_t *x = data + data_sz - index_sz + 1; - - for (i = 0; i < frames; i++) { - uint32_t this_sz = 0; - - for (j = 0; j < mag; j++) - this_sz |= (*x++) << (j * 8); - sizes[i] = this_sz; - } - - *count = frames; - } - } -} - -static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t *ctx, - const uint8_t *data, - unsigned int data_sz, - void *user_priv, - long deadline) { +static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, + const uint8_t *data, unsigned int data_sz, + void *user_priv, long deadline) { const uint8_t *data_start = data; - const uint8_t *data_end = data + data_sz; - vpx_codec_err_t res = 0; - uint32_t sizes[8]; - int frames_this_pts, frame_count = 0; + const uint8_t * const data_end = data + data_sz; + vpx_codec_err_t res; + uint32_t frame_sizes[8]; + int frame_count; - if (data == NULL || data_sz == 0) return VPX_CODEC_INVALID_PARAM; + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } - parse_superframe_index(data, data_sz, sizes, &frames_this_pts); + // Reset flushed when receiving a valid frame. 
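// Decoder-side counterpart of the index written by the encoder: the removed
// parse_superframe_index() above (replaced by vp9_parse_superframe_index
// below) recognizes a trailing index when the last byte has the form
// 110xxxxx. Worked example for the 6-byte index 0xc9 0xAB 0x01 0x89 0x00 0xc9:
//   frames   = (0xc9 & 0x7) + 1 = 2
//   mag      = ((0xc9 >> 3) & 0x3) + 1 = 2 bytes per size
//   index_sz = 2 + 2 * 2 = 6, and data[data_sz - 6] == 0xc9 confirms it
//   sizes    = 0x01AB (427) and 0x0089 (137), read little-endian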
+ ctx->flushed = 0; - do { - // Skip over the superframe index, if present - if (data_sz && (*data_start & 0xe0) == 0xc0) { - const uint8_t marker = *data_start; - const uint32_t frames = (marker & 0x7) + 1; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const uint32_t index_sz = 2 + mag * frames; + res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count, + ctx->decrypt_cb, ctx->decrypt_state); + if (res != VPX_CODEC_OK) + return res; - if (data_sz >= index_sz && data_start[index_sz - 1] == marker) { - data_start += index_sz; - data_sz -= index_sz; - if (data_start < data_end) - continue; - else - break; + if (ctx->frame_parallel_decode) { + // Decode in frame parallel mode. When decoding in this mode, the frame + // passed to the decoder must be either a normal frame or a superframe with + // superframe index so the decoder could get each frame's start position + // in the superframe. + if (frame_count > 0) { + int i; + + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data + || frame_size > (uint32_t) (data_end - data_start)) { + ctx->base.err_detail = "Invalid frame size in index"; + return VPX_CODEC_CORRUPT_FRAME; + } + + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, + deadline); + if (res != VPX_CODEC_OK) + return res; + + data_start += frame_size; + } + } else { + res = decode_one(ctx, &data_start, data_sz, user_priv, deadline); + if (res != VPX_CODEC_OK) + return res; + + // Extra data detected after the frame. + if (data_start < data_end - 1) { + ctx->base.err_detail = "Fail to decode frame in parallel mode"; + return VPX_CODEC_INCAPABLE; } } + } else { + // Decode in serial mode. + if (frame_count > 0) { + int i; - // Use the correct size for this frame, if an index is present. - if (frames_this_pts) { - uint32_t this_sz = sizes[frame_count]; + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data + || frame_size > (uint32_t) (data_end - data_start)) { + ctx->base.err_detail = "Invalid frame size in index"; + return VPX_CODEC_CORRUPT_FRAME; + } - if (data_sz < this_sz) { - ctx->base.err_detail = "Invalid frame size in index"; - return VPX_CODEC_CORRUPT_FRAME; + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, + deadline); + if (res != VPX_CODEC_OK) + return res; + + data_start += frame_size; } + } else { + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t) (data_end - data_start); + const vpx_codec_err_t res = decode_one(ctx, &data_start, frame_size, + user_priv, deadline); + if (res != VPX_CODEC_OK) + return res; - data_sz = this_sz; - frame_count++; + // Account for suboptimal termination by the encoder. + while (data_start < data_end) { + const uint8_t marker = read_marker(ctx->decrypt_cb, + ctx->decrypt_state, data_start); + if (marker) + break; + ++data_start; + } + } } + } - res = decode_one(ctx, &data_start, data_sz, user_priv, deadline); - assert(data_start >= data); - assert(data_start <= data_end); - - /* Early exit if there was a decode error */ - if (res) - break; - - /* Account for suboptimal termination by the encoder. 
*/ - while (data_start < data_end && *data_start == 0) - data_start++; - - data_sz = data_end - data_start; - } while (data_start < data_end); - return res; + return VPX_CODEC_OK; } -static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx, - vpx_codec_iter_t *iter) { +static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, + vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; if (ctx->img_avail) { - /* iter acts as a flip flop, so an image is only returned on the first - * call to get_frame. - */ + // iter acts as a flip flop, so an image is only returned on the first + // call to get_frame. if (!(*iter)) { img = &ctx->img; + img->bit_depth = (int)ctx->pbi->common.bit_depth; *iter = img; } } @@ -475,96 +446,42 @@ static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx, return img; } -static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) { - vpx_codec_err_t res; - const mem_req_t *seg_iter = *iter; +static vpx_codec_err_t decoder_set_fb_fn( + vpx_codec_alg_priv_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + if (cb_get == NULL || cb_release == NULL) { + return VPX_CODEC_INVALID_PARAM; + } else if (ctx->pbi == NULL) { + // If the decoder has already been initialized, do not accept changes to + // the frame buffer functions. + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return VPX_CODEC_OK; + } - /* Get address of next segment request */ - do { - if (!seg_iter) - seg_iter = vp9_mem_req_segs; - else if (seg_iter->id != VP9_SEG_MAX) - seg_iter++; - - *iter = (vpx_codec_iter_t)seg_iter; - - if (seg_iter->id != VP9_SEG_MAX) { - mmap->id = seg_iter->id; - mmap->sz = seg_iter->sz; - mmap->align = seg_iter->align; - mmap->flags = seg_iter->flags; - - if (!seg_iter->sz) - mmap->sz = seg_iter->calc_sz(ctx->config.dec, ctx->init_flags); - - res = VPX_CODEC_OK; - } else { - res = VPX_CODEC_LIST_END; - } - } while (!mmap->sz && res != VPX_CODEC_LIST_END); - - return res; + return VPX_CODEC_ERROR; } -static vpx_codec_err_t vp9_xma_set_mmap(vpx_codec_ctx_t *ctx, - const vpx_codec_mmap_t *mmap) { - vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; - int i, done; - - if (!ctx->priv) { - if (mmap->id == VP9_SEG_ALG_PRIV) { - if (!ctx->priv) { - vp9_init_ctx(ctx, mmap); - res = VPX_CODEC_OK; - } - } - } - - done = 1; - - if (!res && ctx->priv->alg_priv) { - for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++) { - if (ctx->priv->alg_priv->mmaps[i].id == mmap->id) - if (!ctx->priv->alg_priv->mmaps[i].base) { - ctx->priv->alg_priv->mmaps[i] = *mmap; - res = VPX_CODEC_OK; - } - - done &= (ctx->priv->alg_priv->mmaps[i].base != NULL); - } - } - - if (done && !res) { - vp9_finalize_mmaps(ctx->priv->alg_priv); - res = ctx->iface->init(ctx, NULL); - } - - return res; -} - -static vpx_codec_err_t set_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { - vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); +static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); if (data) { - vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; + vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; image2yuvconfig(&frame->img, &sd); - - return vp9_set_reference_dec(ctx->pbi, + return vp9_set_reference_dec(&ctx->pbi->common, (VP9_REFFRAME)frame->frame_type, &sd); } else { return VPX_CODEC_INVALID_PARAM; } 
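// Typical application loop over decoder_decode()/decoder_get_frame() above,
// a sketch assuming an initialized vpx_codec_ctx_t `decoder` and one
// compressed (super)frame in buf/buf_sz:
//   vpx_codec_iter_t iter = NULL;
//   vpx_image_t *img;
//   if (vpx_codec_decode(&decoder, buf, buf_sz, NULL, 0))
//     handle_error(&decoder);                  // hypothetical helper
//   while ((img = vpx_codec_get_frame(&decoder, &iter)) != NULL)
//     render(img);                             // hypothetical consumer
// The iter flip-flop means each decoded image is returned exactly once.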
} -static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); if (data) { @@ -580,15 +497,14 @@ static vpx_codec_err_t copy_reference(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, + va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); if (data) { - YV12_BUFFER_CONFIG* fb; + YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx); + if (fb == NULL) return VPX_CODEC_ERROR; - vp9_get_reference_dec(ctx->pbi, data->idx, &fb); yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; } else { @@ -596,9 +512,8 @@ static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, - int ctr_id, - va_list args) { +static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, + va_list args) { #if CONFIG_VP9_POSTPROC vp8_postproc_cfg_t *data = va_arg(args, vp8_postproc_cfg_t *); @@ -610,56 +525,26 @@ static vpx_codec_err_t set_postproc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; } #else + (void)ctx; + (void)args; return VPX_CODEC_INCAPABLE; #endif } -static vpx_codec_err_t set_dbg_options(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { -#if CONFIG_POSTPROC_VISUALIZER && CONFIG_POSTPROC - int data = va_arg(args, int); - -#define MAP(id, var) case id: var = data; break; - - switch (ctrl_id) { - MAP(VP8_SET_DBG_COLOR_REF_FRAME, ctx->dbg_color_ref_frame_flag); - MAP(VP8_SET_DBG_COLOR_MB_MODES, ctx->dbg_color_mb_modes_flag); - MAP(VP8_SET_DBG_COLOR_B_MODES, ctx->dbg_color_b_modes_flag); - MAP(VP8_SET_DBG_DISPLAY_MV, ctx->dbg_display_mv_flag); - } - - return VPX_CODEC_OK; -#else - return VPX_CODEC_INCAPABLE; -#endif -} - -static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx, - int ctrl_id, +static vpx_codec_err_t ctrl_set_dbg_options(vpx_codec_alg_priv_t *ctx, va_list args) { - int *update_info = va_arg(args, int *); - VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; + (void)ctx; + (void)args; + return VPX_CODEC_INCAPABLE; +} + +static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *const update_info = va_arg(args, int *); if (update_info) { - *update_info = pbi->refresh_frame_flags; - - return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; - } -} - - -static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list args) { - int *corrupted = va_arg(args, int *); - - if (corrupted) { - VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; - if (pbi) - *corrupted = pbi->common.frame_to_show->corrupted; + if (ctx->pbi) + *update_info = ctx->pbi->refresh_frame_flags; else return VPX_CODEC_ERROR; return VPX_CODEC_OK; @@ -668,55 +553,119 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx, } } -static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx, - int ctr_id, + +static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + + if (corrupted != NULL && ctx->pbi != NULL) { + const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; + return VPX_CODEC_OK; + } else { + return 
VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx, va_list args) { + int *const display_size = va_arg(args, int *); + + if (display_size) { + if (ctx->pbi) { + const VP9_COMMON *const cm = &ctx->pbi->common; + display_size[0] = cm->display_width; + display_size[1] = cm->display_height; + } else { + return VPX_CODEC_ERROR; + } + return VPX_CODEC_OK; + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + + if (bit_depth) { + if (ctx->pbi) { + const VP9_COMMON *const cm = &ctx->pbi->common; + *bit_depth = cm->bit_depth; + return VPX_CODEC_OK; + } else { + return VPX_CODEC_ERROR; + } + } else { + return VPX_CODEC_INVALID_PARAM; + } +} + +static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx, + va_list args) { ctx->invert_tile_order = va_arg(args, int); return VPX_CODEC_OK; } -static vpx_codec_ctrl_fn_map_t ctf_maps[] = { - {VP8_SET_REFERENCE, set_reference}, - {VP8_COPY_REFERENCE, copy_reference}, - {VP8_SET_POSTPROC, set_postproc}, - {VP8_SET_DBG_COLOR_REF_FRAME, set_dbg_options}, - {VP8_SET_DBG_COLOR_MB_MODES, set_dbg_options}, - {VP8_SET_DBG_COLOR_B_MODES, set_dbg_options}, - {VP8_SET_DBG_DISPLAY_MV, set_dbg_options}, - {VP8D_GET_LAST_REF_UPDATES, get_last_ref_updates}, - {VP8D_GET_FRAME_CORRUPTED, get_frame_corrupted}, - {VP9_GET_REFERENCE, get_reference}, - {VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order}, +static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx, + va_list args) { + vpx_decrypt_init *init = va_arg(args, vpx_decrypt_init *); + ctx->decrypt_cb = init ? init->decrypt_cb : NULL; + ctx->decrypt_state = init ? 
init->decrypt_state : NULL; + return VPX_CODEC_OK; +} + +static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { + {VP8_COPY_REFERENCE, ctrl_copy_reference}, + + // Setters + {VP8_SET_REFERENCE, ctrl_set_reference}, + {VP8_SET_POSTPROC, ctrl_set_postproc}, + {VP8_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options}, + {VP8_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options}, + {VP8_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options}, + {VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options}, + {VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order}, + {VPXD_SET_DECRYPTOR, ctrl_set_decryptor}, + + // Getters + {VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates}, + {VP8D_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted}, + {VP9_GET_REFERENCE, ctrl_get_reference}, + {VP9D_GET_DISPLAY_SIZE, ctrl_get_display_size}, + {VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth}, + { -1, NULL}, }; - #ifndef VERSION_STRING #define VERSION_STRING #endif CODEC_INTERFACE(vpx_codec_vp9_dx) = { "WebM Project VP9 Decoder" VERSION_STRING, VPX_CODEC_INTERNAL_ABI_VERSION, - VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC, - /* vpx_codec_caps_t caps; */ - vp9_init, /* vpx_codec_init_fn_t init; */ - vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */ - ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - vp9_xma_get_mmap, /* vpx_codec_get_mmap_fn_t get_mmap; */ - vp9_xma_set_mmap, /* vpx_codec_set_mmap_fn_t set_mmap; */ + VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC | + VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // vpx_codec_caps_t + decoder_init, // vpx_codec_init_fn_t + decoder_destroy, // vpx_codec_destroy_fn_t + decoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t { // NOLINT - vp9_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ - vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */ - vp9_decode, /* vpx_codec_decode_fn_t decode; */ - vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + decoder_peek_si, // vpx_codec_peek_si_fn_t + decoder_get_si, // vpx_codec_get_si_fn_t + decoder_decode, // vpx_codec_decode_fn_t + decoder_get_frame, // vpx_codec_frame_get_fn_t + decoder_set_fb_fn, // vpx_codec_set_fb_fn_t }, { // NOLINT - /* encoder functions */ - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED, - NOT_IMPLEMENTED + 0, + NULL, // vpx_codec_enc_cfg_map_t + NULL, // vpx_codec_encode_fn_t + NULL, // vpx_codec_get_cx_data_fn_t + NULL, // vpx_codec_enc_config_set_fn_t + NULL, // vpx_codec_get_global_headers_fn_t + NULL, // vpx_codec_get_preview_frame_fn_t + NULL // vpx_codec_enc_mr_get_mem_loc_fn_t } }; diff --git a/media/libvpx/vp9/vp9_iface_common.h b/media/libvpx/vp9/vp9_iface_common.h index ed0122c1b8fa..fc98b62c5e86 100644 --- a/media/libvpx/vp9/vp9_iface_common.h +++ b/media/libvpx/vp9/vp9_iface_common.h @@ -16,9 +16,11 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, * the Y, U, and V planes, nor other alignment adjustments that * might be representable by a YV12_BUFFER_CONFIG, so we just * initialize all the fields.*/ - int bps = 12; - if (yv12->uv_height == yv12->y_height) { - if (yv12->uv_width == yv12->y_width) { + const int ss_x = yv12->uv_crop_width < yv12->y_crop_width; + const int ss_y = yv12->uv_crop_height < yv12->y_crop_height; + int bps; + if (!ss_y) { + if (!ss_x) { img->fmt = VPX_IMG_FMT_I444; bps = 24; } else { @@ -27,21 +29,23 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } } else { img->fmt = VPX_IMG_FMT_I420; + bps = 12; } + img->bit_depth = 8; img->w = yv12->y_stride; - img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3); + 
img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; - img->x_chroma_shift = yv12->uv_width < yv12->y_width; - img->y_chroma_shift = yv12->uv_height < yv12->y_height; + img->x_chroma_shift = ss_x; + img->y_chroma_shift = ss_y; img->planes[VPX_PLANE_Y] = yv12->y_buffer; img->planes[VPX_PLANE_U] = yv12->u_buffer; img->planes[VPX_PLANE_V] = yv12->v_buffer; - img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; img->stride[VPX_PLANE_Y] = yv12->y_stride; img->stride[VPX_PLANE_U] = yv12->uv_stride; img->stride[VPX_PLANE_V] = yv12->uv_stride; - img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; img->bps = bps; img->user_priv = user_priv; img->img_data = yv12->buffer_alloc; @@ -54,7 +58,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->y_buffer = img->planes[VPX_PLANE_Y]; yv12->u_buffer = img->planes[VPX_PLANE_U]; yv12->v_buffer = img->planes[VPX_PLANE_V]; - yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA]; yv12->y_crop_width = img->d_w; yv12->y_crop_height = img->d_h; @@ -66,21 +69,10 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height; - yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0; - yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0; - yv12->y_stride = img->stride[VPX_PLANE_Y]; yv12->uv_stride = img->stride[VPX_PLANE_U]; - yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0; yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; -#if CONFIG_ALPHA - // For development purposes, force alpha to hold the same data a Y for now. 
- yv12->alpha_buffer = yv12->y_buffer; - yv12->alpha_width = yv12->y_width; - yv12->alpha_height = yv12->y_height; - yv12->alpha_stride = yv12->y_stride; -#endif return VPX_CODEC_OK; } diff --git a/media/libvpx/vp9_rtcd_armv7-android-gcc.h b/media/libvpx/vp9_rtcd_armv7-android-gcc.h index f6dad7d48efe..c4da1238be15 100644 --- a/media/libvpx/vp9_rtcd_armv7-android-gcc.h +++ b/media/libvpx/vp9_rtcd_armv7-android-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP9 */ @@ -19,220 +23,18 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct search_site_config; +struct mv; union int_mv; struct yv12_buffer_config; -void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c - -void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c - -void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c - -void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c - -void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c - -void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c - -void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c - -void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c - -void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c - -void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c - -void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c - -void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c - -void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c - -void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c - -void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c - -void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c - -void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c - -void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c - -void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c - -void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c - -void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c - -void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c - -void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c - -void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c - -void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c - -void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c - -void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c - -void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c - -void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c - -void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c - -void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c - -void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c - -void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c - -void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c - -void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c - -void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c - -void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c - -void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t 
*left); -#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c - -void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c - -void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c - -void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c - -void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c - -void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c - -void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c - -void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c - -void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c - -void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c - -void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c - -void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c - -void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c - -void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c - -void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c - -void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); - -void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, 
const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - -void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, 
const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - -void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*vp9_convolve8_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -245,25 +47,222 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t void vp9_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct4x4_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t 
*src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct8x8_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_64_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_10_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct8x8_10_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_4x4 
vp9_d117_predictor_4x4_c + +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c + +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c + +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c + +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c + +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c + +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c + +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c + +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c + +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c + +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c + +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c + +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c + +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c + +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c + +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c + +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define 
vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c + +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c + +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c + +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c + +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c + +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct16x16 vp9_fdct16x16_c + +void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct16x16_1 vp9_fdct16x16_1_c + +void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct32x32 vp9_fdct32x32_c + +void vp9_fdct32x32_1_c(const int16_t *input, 
int16_t *output, int stride); +#define vp9_fdct32x32_1 vp9_fdct32x32_1_c + +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c + +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct4x4 vp9_fdct4x4_c + +void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct4x4_1 vp9_fdct4x4_1_c + +void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_neon(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride); + +void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct8x8_1)(const int16_t *input, int16_t *output, int stride); + +void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_full_range_search vp9_full_range_search_c + +int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +#define vp9_full_search_sad vp9_full_search_sad_c + +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +unsigned int vp9_get_mb_ss_c(const int16_t *); +#define vp9_get_mb_ss vp9_get_mb_ss_c + +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t 
*left); + +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct16x16_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); @@ -273,21 +272,41 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_strid void vp9_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride); - void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c - void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct32x32_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct4x4_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_12_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_1_add_neon(const int16_t *input, uint8_t *dest, 
int dest_stride); +RTCD_EXTERN void (*vp9_idct8x8_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_64_add_neon(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); void vp9_iht4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); RTCD_EXTERN void (*vp9_iht4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); @@ -296,403 +315,417 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i void vp9_iht8x8_64_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); RTCD_EXTERN void (*vp9_iht8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type); -#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c +void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c -void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c +void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_c +void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_c +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_c +void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_c +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_c +void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vp9_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_c +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vp9_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_c +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_c +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, 
const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_c +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_c - -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get_sse_sum_8x8 vp9_get_sse_sum_8x8_c - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_c - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_c - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_c - -unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c - -unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c - -unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c - -unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c - -unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c - -unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c - -unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define 
vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c - -unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c - -unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c - -unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c - -unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c - -unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c - -unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c - -unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c - -unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c - -unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c - -unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c - -unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c - -unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c - -unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c - -unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c - -unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c - -unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c - -unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c - -unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c - -unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c - -unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x64 vp9_sad64x64_c - -unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x64 vp9_sad32x64_c - -unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x32 vp9_sad64x32_c - -unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x16 vp9_sad32x16_c - -unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x32 vp9_sad16x32_c - -unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x32 vp9_sad32x32_c - -unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x16 vp9_sad16x16_c - -unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x8 vp9_sad16x8_c - -unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x16 vp9_sad8x16_c - -unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x8 vp9_sad8x8_c - -unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x4 vp9_sad8x4_c - -unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x8 vp9_sad4x8_c - -unsigned int vp9_sad4x4_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x4 vp9_sad4x4_c - -unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x64_avg vp9_sad64x64_avg_c - -unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x64_avg vp9_sad32x64_avg_c - -unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x32_avg vp9_sad64x32_avg_c - -unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x16_avg vp9_sad32x16_avg_c - -unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x32_avg vp9_sad16x32_avg_c - -unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x32_avg vp9_sad32x32_avg_c - -unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x16_avg vp9_sad16x16_avg_c - -unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x8_avg vp9_sad16x8_avg_c - -unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x16_avg vp9_sad8x16_avg_c - -unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x8_avg vp9_sad8x8_avg_c - -unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x4_avg vp9_sad8x4_avg_c - -unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x8_avg vp9_sad4x8_avg_c - -unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x4_avg vp9_sad4x4_avg_c - -unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_c - -unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_c - -unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_c 
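/*
 * Illustrative sketch, not part of the patch: the shape shared by the
 * vp9_sadNxM_c kernels declared above. Each kernel walks an NxM block and
 * accumulates the sum of absolute differences between source and reference
 * rows; the `max_sad` parameter being removed in this diff allowed early
 * termination in the old API, and the `_avg` variants first average the
 * reference with `second_pred`. In the real generated code the block size
 * is hard-coded per function; here it is a parameter for brevity, and
 * `sad_block_c` is a hypothetical name.
 */
#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_block_c(const uint8_t *src_ptr, int source_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      sad += abs(src_ptr[c] - ref_ptr[c]);  /* per-pixel absolute difference */
    src_ptr += source_stride;               /* advance one row in each plane */
    ref_ptr += ref_stride;
  }
  return sad;
}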
- -unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c - -unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c - -unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c - -unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c - -unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c - -unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c - -void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x3 vp9_sad64x64x3_c - -void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x3 vp9_sad32x32x3_c - -void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad16x16x3 vp9_sad16x16x3_c - -void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad16x8x3 vp9_sad16x8x3_c - -void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad8x16x3 vp9_sad8x16x3_c - -void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad8x8x3 vp9_sad8x8x3_c - -void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad4x4x3 vp9_sad4x4x3_c - -void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad64x64x8 vp9_sad64x64x8_c - -void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad32x32x8 vp9_sad32x32x8_c - -void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x16x8 vp9_sad16x16x8_c - -void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x8x8 vp9_sad16x8x8_c - -void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x16x8 vp9_sad8x16x8_c - -void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x8x8 
vp9_sad8x8x8_c - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x4x8 vp9_sad8x4x8_c - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x8x8 vp9_sad4x8x8_c - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x4x8 vp9_sad4x4x8_c - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x4d vp9_sad64x64x4d_c - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x64x4d vp9_sad32x64x4d_c - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad64x32x4d vp9_sad64x32x4d_c - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x16x4d vp9_sad32x16x4d_c - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x32x4d vp9_sad16x32x4d_c - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x4d vp9_sad32x32x4d_c - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x16x4d vp9_sad16x16x4d_c - -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x8x4d vp9_sad16x8x4d_c - -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x16x4d vp9_sad8x16x4d_c - -void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x8x4d vp9_sad8x8x4d_c - -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x4x4d vp9_sad8x4x4d_c - -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad4x8x4d vp9_sad4x8x4d_c - -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad4x4x4d vp9_sad4x4x4d_c +void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, 
int recon_stride, unsigned int *sse); #define vp9_mse16x16 vp9_mse16x16_c -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x16 vp9_mse8x16_c + unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse8x8 vp9_mse8x8_c -unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c - -unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_c - -int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_c - -void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -#define vp9_subtract_block vp9_subtract_block_c - void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_b vp9_quantize_b_c void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c -void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type); -#define vp9_short_fht4x4 vp9_short_fht4x4_c +void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +RTCD_EXTERN void (*vp9_quantize_fp)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t 
*dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type); -#define vp9_short_fht8x8 vp9_short_fht8x8_c +void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c -void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type); -#define vp9_short_fht16x16 vp9_short_fht16x16_c - -void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fwht4x4 vp9_fwht4x4_c - -void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct4x4 vp9_fdct4x4_c - -void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct8x8 vp9_fdct8x8_c - -void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct16x16 vp9_fdct16x16_c - -void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct32x32 vp9_fdct32x32_c - -void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c - -int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -#define vp9_full_search_sad vp9_full_search_sad_c - -int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); +int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); #define vp9_refining_search_sad vp9_refining_search_sad_c -int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -#define vp9_diamond_search_sad vp9_diamond_search_sad_c +unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x16_avg vp9_sad16x16_avg_c + +void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad16x16x3 vp9_sad16x16x3_c + +void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad16x16x4d vp9_sad16x16x4d_c + +void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x16x8 vp9_sad16x16x8_c + +unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x32 vp9_sad16x32_c + +unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x32_avg vp9_sad16x32_avg_c + +void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad16x32x4d vp9_sad16x32x4d_c + +unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x8 vp9_sad16x8_c + +unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x8_avg vp9_sad16x8_avg_c + +void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad16x8x3 vp9_sad16x8x3_c + +void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad16x8x4d vp9_sad16x8x4d_c + +void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x8x8 vp9_sad16x8x8_c + +unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x16 vp9_sad32x16_c + +unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x16_avg vp9_sad32x16_avg_c + +void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad32x16x4d vp9_sad32x16x4d_c + +unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x32_avg vp9_sad32x32_avg_c + +void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x3 vp9_sad32x32x3_c + +void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x4d vp9_sad32x32x4d_c + +void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad32x32x8 vp9_sad32x32x8_c + +unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x64 vp9_sad32x64_c + +unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x64_avg vp9_sad32x64_avg_c + +void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned 
int *sad_array); +#define vp9_sad32x64x4d vp9_sad32x64x4d_c + +unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x4 vp9_sad4x4_c + +unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x4_avg vp9_sad4x4_avg_c + +void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad4x4x3 vp9_sad4x4x3_c + +void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad4x4x4d vp9_sad4x4x4d_c + +void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x4x8 vp9_sad4x4x8_c + +unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x8 vp9_sad4x8_c + +unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x8_avg vp9_sad4x8_avg_c + +void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad4x8x4d vp9_sad4x8x4d_c + +void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x8x8 vp9_sad4x8x8_c + +unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x32 vp9_sad64x32_c + +unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x32_avg vp9_sad64x32_avg_c + +void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad64x32x4d vp9_sad64x32x4d_c + +unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x64_avg vp9_sad64x64_avg_c + +void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x3 vp9_sad64x64x3_c + +void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x4d vp9_sad64x64x4d_c + +void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad64x64x8 vp9_sad64x64x8_c + +unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x16 vp9_sad8x16_c + +unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x16_avg vp9_sad8x16_avg_c + +void vp9_sad8x16x3_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad8x16x3 vp9_sad8x16x3_c + +void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x16x4d vp9_sad8x16x4d_c + +void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x16x8 vp9_sad8x16x8_c + +unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x4 vp9_sad8x4_c + +unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x4_avg vp9_sad8x4_avg_c + +void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x4x4d vp9_sad8x4x4d_c + +void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x4x8 vp9_sad8x4x8_c + +unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x8_avg vp9_sad8x8_avg_c + +void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad8x8x3 vp9_sad8x8x3_c + +void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x8x4d vp9_sad8x8x4d_c + +void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x8x8 vp9_sad8x8x8_c + +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c + +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c + +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c + +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c + +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define 
vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c + +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c + +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c + +unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c + +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c + +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c + +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c + +unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c + +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c + +unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c + +unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c + +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c 
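/*
 * Illustrative sketch, not part of the patch: how the RTCD_EXTERN entries
 * above are resolved at run time. For each function with a NEON
 * specialization, the generated header declares the _c and _neon variants
 * plus a function pointer, and setup_rtcd_internal() points it at the best
 * variant the CPU reports; functions without a specialization are bound
 * directly with #define and pay no indirection. The names my_kernel_* and
 * MY_HAS_NEON are hypothetical stand-ins for the vp9_*_c/_neon pairs and
 * the HAS_NEON flag used in this header.
 */
#include <stdint.h>

static unsigned int my_kernel_c(const uint8_t *src)    { return src[0]; }
static unsigned int my_kernel_neon(const uint8_t *src) { return src[0]; }

#define MY_HAS_NEON 0x1
/* RTCD_EXTERN-style dispatch pointer */
static unsigned int (*my_kernel)(const uint8_t *src);

static void my_setup_rtcd_internal(int flags) {
  my_kernel = my_kernel_c;                   /* install the safe C fallback first */
  if (flags & MY_HAS_NEON)
    my_kernel = my_kernel_neon;              /* then upgrade if the CPU allows */
}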
+ +unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c + +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c + +unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c + +unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c + +unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c + +unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c + +unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c + +unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vp9_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vp9_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); #define 
vp9_temporal_filter_apply vp9_temporal_filter_apply_c -void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction); -#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define 
vp9_variance16x32 vp9_variance16x32_c + +unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x8 vp9_variance16x8_c + +unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x16 vp9_variance32x16_c + +unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x64 vp9_variance32x64_c + +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x4 vp9_variance4x4_c + +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x8 vp9_variance4x8_c + +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance64x32 vp9_variance64x32_c + +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance64x64 vp9_variance64x64_c + +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x16 vp9_variance8x16_c + +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x4 vp9_variance8x4_c + +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vp9_rtcd(void); + #include "vpx_config.h" #ifdef RTCD_C @@ -703,139 +736,131 @@ static void setup_rtcd_internal(void) (void)flags; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c; - if (flags & HAS_NEON) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_neon; - - vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c; - if (flags & HAS_NEON) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_neon; - - vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c; - if (flags & HAS_NEON) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_neon; - - vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c; - if (flags & HAS_NEON) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_neon; - - vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c; - if (flags & HAS_NEON) vp9_mbloop_filter_horizontal_edge = 
vp9_mbloop_filter_horizontal_edge_neon; - - vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c; - if (flags & HAS_NEON) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_neon; - - - - - vp9_convolve_copy = vp9_convolve_copy_c; - if (flags & HAS_NEON) vp9_convolve_copy = vp9_convolve_copy_neon; - - vp9_convolve_avg = vp9_convolve_avg_c; - if (flags & HAS_NEON) vp9_convolve_avg = vp9_convolve_avg_neon; - vp9_convolve8 = vp9_convolve8_c; if (flags & HAS_NEON) vp9_convolve8 = vp9_convolve8_neon; - - vp9_convolve8_horiz = vp9_convolve8_horiz_c; - if (flags & HAS_NEON) vp9_convolve8_horiz = vp9_convolve8_horiz_neon; - - vp9_convolve8_vert = vp9_convolve8_vert_c; - if (flags & HAS_NEON) vp9_convolve8_vert = vp9_convolve8_vert_neon; - vp9_convolve8_avg = vp9_convolve8_avg_c; if (flags & HAS_NEON) vp9_convolve8_avg = vp9_convolve8_avg_neon; - vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_c; if (flags & HAS_NEON) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_neon; - vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_c; if (flags & HAS_NEON) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_neon; - - vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; - if (flags & HAS_NEON) vp9_idct4x4_1_add = vp9_idct4x4_1_add_neon; - - vp9_idct4x4_16_add = vp9_idct4x4_16_add_c; - if (flags & HAS_NEON) vp9_idct4x4_16_add = vp9_idct4x4_16_add_neon; - - vp9_idct8x8_1_add = vp9_idct8x8_1_add_c; - if (flags & HAS_NEON) vp9_idct8x8_1_add = vp9_idct8x8_1_add_neon; - - vp9_idct8x8_64_add = vp9_idct8x8_64_add_c; - if (flags & HAS_NEON) vp9_idct8x8_64_add = vp9_idct8x8_64_add_neon; - - vp9_idct8x8_10_add = vp9_idct8x8_10_add_c; - if (flags & HAS_NEON) vp9_idct8x8_10_add = vp9_idct8x8_10_add_neon; - - vp9_idct16x16_1_add = vp9_idct16x16_1_add_c; - if (flags & HAS_NEON) vp9_idct16x16_1_add = vp9_idct16x16_1_add_neon; - - vp9_idct16x16_256_add = vp9_idct16x16_256_add_c; - if (flags & HAS_NEON) vp9_idct16x16_256_add = vp9_idct16x16_256_add_neon; - + vp9_convolve8_horiz = vp9_convolve8_horiz_c; + if (flags & HAS_NEON) vp9_convolve8_horiz = vp9_convolve8_horiz_neon; + vp9_convolve8_vert = vp9_convolve8_vert_c; + if (flags & HAS_NEON) vp9_convolve8_vert = vp9_convolve8_vert_neon; + vp9_convolve_avg = vp9_convolve_avg_c; + if (flags & HAS_NEON) vp9_convolve_avg = vp9_convolve_avg_neon; + vp9_convolve_copy = vp9_convolve_copy_c; + if (flags & HAS_NEON) vp9_convolve_copy = vp9_convolve_copy_neon; + vp9_fdct8x8 = vp9_fdct8x8_c; + if (flags & HAS_NEON) vp9_fdct8x8 = vp9_fdct8x8_neon; + vp9_fdct8x8_1 = vp9_fdct8x8_1_c; + if (flags & HAS_NEON) vp9_fdct8x8_1 = vp9_fdct8x8_1_neon; + vp9_get16x16var = vp9_get16x16var_c; + if (flags & HAS_NEON) vp9_get16x16var = vp9_get16x16var_neon; + vp9_get8x8var = vp9_get8x8var_c; + if (flags & HAS_NEON) vp9_get8x8var = vp9_get8x8var_neon; + vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; + if (flags & HAS_NEON) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_neon; + vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; + if (flags & HAS_NEON) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_neon; + vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; + if (flags & HAS_NEON) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_neon; + vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; + if (flags & HAS_NEON) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_neon; vp9_idct16x16_10_add = vp9_idct16x16_10_add_c; if (flags & HAS_NEON) vp9_idct16x16_10_add = vp9_idct16x16_10_add_neon; - + vp9_idct16x16_1_add = vp9_idct16x16_1_add_c; + if (flags & HAS_NEON) vp9_idct16x16_1_add = vp9_idct16x16_1_add_neon; + 
vp9_idct16x16_256_add = vp9_idct16x16_256_add_c; + if (flags & HAS_NEON) vp9_idct16x16_256_add = vp9_idct16x16_256_add_neon; vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c; if (flags & HAS_NEON) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_neon; - - vp9_idct32x32_1_add = vp9_idct32x32_1_add_c; if (flags & HAS_NEON) vp9_idct32x32_1_add = vp9_idct32x32_1_add_neon; - + vp9_idct32x32_34_add = vp9_idct32x32_34_add_c; + if (flags & HAS_NEON) vp9_idct32x32_34_add = vp9_idct32x32_1024_add_neon; + vp9_idct4x4_16_add = vp9_idct4x4_16_add_c; + if (flags & HAS_NEON) vp9_idct4x4_16_add = vp9_idct4x4_16_add_neon; + vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; + if (flags & HAS_NEON) vp9_idct4x4_1_add = vp9_idct4x4_1_add_neon; + vp9_idct8x8_12_add = vp9_idct8x8_12_add_c; + if (flags & HAS_NEON) vp9_idct8x8_12_add = vp9_idct8x8_12_add_neon; + vp9_idct8x8_1_add = vp9_idct8x8_1_add_c; + if (flags & HAS_NEON) vp9_idct8x8_1_add = vp9_idct8x8_1_add_neon; + vp9_idct8x8_64_add = vp9_idct8x8_64_add_c; + if (flags & HAS_NEON) vp9_idct8x8_64_add = vp9_idct8x8_64_add_neon; vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon; - vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; if (flags & HAS_NEON) vp9_iht8x8_64_add = vp9_iht8x8_64_add_neon; + vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_c; + if (flags & HAS_NEON) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_neon; + vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_c; + if (flags & HAS_NEON) vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_neon; + vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_c; + if (flags & HAS_NEON) vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_neon; + vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_c; + if (flags & HAS_NEON) vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_neon; + vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_c; + if (flags & HAS_NEON) vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_neon; + vp9_lpf_vertical_16 = vp9_lpf_vertical_16_c; + if (flags & HAS_NEON) vp9_lpf_vertical_16 = vp9_lpf_vertical_16_neon; + vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_c; + if (flags & HAS_NEON) vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_neon; + vp9_lpf_vertical_4 = vp9_lpf_vertical_4_c; + if (flags & HAS_NEON) vp9_lpf_vertical_4 = vp9_lpf_vertical_4_neon; + vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_c; + if (flags & HAS_NEON) vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_neon; + vp9_lpf_vertical_8 = vp9_lpf_vertical_8_c; + if (flags & HAS_NEON) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_neon; + vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c; + if (flags & HAS_NEON) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_neon; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_NEON) vp9_quantize_fp = vp9_quantize_fp_neon; + vp9_sad16x16 = vp9_sad16x16_c; + if (flags & HAS_NEON) vp9_sad16x16 = vp9_sad16x16_neon; + vp9_sad32x32 = vp9_sad32x32_c; + if (flags & HAS_NEON) vp9_sad32x32 = vp9_sad32x32_neon; + vp9_sad64x64 = vp9_sad64x64_c; + if (flags & HAS_NEON) vp9_sad64x64 = vp9_sad64x64_neon; + vp9_sad8x8 = vp9_sad8x8_c; + if (flags & HAS_NEON) vp9_sad8x8 = vp9_sad8x8_neon; + vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_c; + if (flags & HAS_NEON) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_neon; + vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c; + if (flags & HAS_NEON) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_neon; + vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_c; + if (flags & HAS_NEON) 
vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_neon; + vp9_subtract_block = vp9_subtract_block_c; + if (flags & HAS_NEON) vp9_subtract_block = vp9_subtract_block_neon; + vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_c; + if (flags & HAS_NEON) vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_neon; + vp9_tm_predictor_32x32 = vp9_tm_predictor_32x32_c; + if (flags & HAS_NEON) vp9_tm_predictor_32x32 = vp9_tm_predictor_32x32_neon; + vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_c; + if (flags & HAS_NEON) vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_neon; + vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_c; + if (flags & HAS_NEON) vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_neon; + vp9_v_predictor_16x16 = vp9_v_predictor_16x16_c; + if (flags & HAS_NEON) vp9_v_predictor_16x16 = vp9_v_predictor_16x16_neon; + vp9_v_predictor_32x32 = vp9_v_predictor_32x32_c; + if (flags & HAS_NEON) vp9_v_predictor_32x32 = vp9_v_predictor_32x32_neon; + vp9_v_predictor_4x4 = vp9_v_predictor_4x4_c; + if (flags & HAS_NEON) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_neon; + vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; + if (flags & HAS_NEON) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_neon; + vp9_variance16x16 = vp9_variance16x16_c; + if (flags & HAS_NEON) vp9_variance16x16 = vp9_variance16x16_neon; + vp9_variance32x32 = vp9_variance32x32_c; + if (flags & HAS_NEON) vp9_variance32x32 = vp9_variance32x32_neon; + vp9_variance8x8 = vp9_variance8x8_c; + if (flags & HAS_NEON) vp9_variance8x8 = vp9_variance8x8_neon; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp9_rtcd_generic-gnu.h b/media/libvpx/vp9_rtcd_generic-gnu.h index fca4466af058..5c9b779ef9ec 100644 --- a/media/libvpx/vp9_rtcd_generic-gnu.h +++ b/media/libvpx/vp9_rtcd_generic-gnu.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP9 */ @@ -19,209 +23,17 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct search_site_config; +struct mv; union int_mv; struct yv12_buffer_config; -void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c - -void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c - -void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c - -void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c - -void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c - -void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c - -void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c - -void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c - -void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t 
*left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_c
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_c
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_c
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c

void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8 vp9_convolve8_c

-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_horiz vp9_convolve8_horiz_c
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve8_vert vp9_convolve8_vert_c
-
void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg vp9_convolve8_avg_c
@@ -231,20 +43,209 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
#define vp9_convolve8_avg_vert vp9_convolve8_avg_vert_c

-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_c
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_horiz vp9_convolve8_horiz_c

-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_c
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve8_vert vp9_convolve8_vert_c

-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_c
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c

-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_c
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c

-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_10_add vp9_idct8x8_10_add_c
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16 vp9_fdct16x16_c
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16_1 vp9_fdct16x16_1_c
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32 vp9_fdct32x32_c
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32_1 vp9_fdct32x32_1_c
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct4x4 vp9_fdct4x4_c
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct4x4_1 vp9_fdct4x4_1_c
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct8x8 vp9_fdct8x8_c
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct8x8_1 vp9_fdct8x8_1_c
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get16x16var vp9_get16x16var_c
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get8x8var vp9_get8x8var_c
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+#define vp9_get_mb_ss vp9_get_mb_ss_c
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct16x16_10_add vp9_idct16x16_10_add_c

void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_idct16x16_1_add vp9_idct16x16_1_add_c
@@ -252,17 +253,32 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride)
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_idct16x16_256_add vp9_idct16x16_256_add_c

-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_c
-
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_c

+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
+
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c

-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_16_add vp9_idct4x4_16_add_c
+
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_1_add vp9_idct4x4_1_add_c
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct8x8_12_add vp9_idct8x8_12_add_c
+
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct8x8_1_add vp9_idct8x8_1_add_c
+
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct8x8_64_add vp9_idct8x8_64_add_c
+
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c

void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
@@ -270,409 +286,396 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c

-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c

void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c

-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
+void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_c

-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
+void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_c

-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
+void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_c

-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
+void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_c

-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
+void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_c

-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
+void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_c

-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
+void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_c

-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_c
+void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_c

-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_c
+void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_c

-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_c
+void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_c

-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_c
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get_sse_sum_8x8 vp9_get_sse_sum_8x8_c
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_c
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x64 vp9_sad64x64_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x32 vp9_sad32x32_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x16 vp9_sad16x16_c
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x8 vp9_sad16x8_c
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x16 vp9_sad8x16_c
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x8 vp9_sad8x8_c
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x4 vp9_sad4x4_c
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_c
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_c
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_c
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x3 vp9_sad16x16x3_c
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x3 vp9_sad16x8x3_c
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x3 vp9_sad8x16x3_c
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x3 vp9_sad8x8x3_c
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x3 vp9_sad4x4x3_c
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_c
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_c
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_c
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_c
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_c
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_c
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_c
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_c
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_c
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_c
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_c
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_c
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_c
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_c

unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse16x16 vp9_mse16x16_c

-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse16x8 vp9_mse16x8_c

+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_c
+
unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
#define vp9_mse8x8 vp9_mse8x8_c

-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_c
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_c
-
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_b vp9_quantize_b_c

void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c

-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht4x4 vp9_short_fht4x4_c
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c

-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht8x8 vp9_short_fht8x8_c
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c

-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht16x16 vp9_short_fht16x16_c
-
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
-
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_c
-
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_c
-
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_c
-
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_c
-
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_c
-
-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_refining_search_sad vp9_refining_search_sad_c

-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x16 vp9_sad16x16_c

-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x16_avg vp9_sad16x16_avg_c
+
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x16x3 vp9_sad16x16x3_c
+
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x16x4d vp9_sad16x16x4d_c
+
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c
+
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x32 vp9_sad16x32_c
+
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x32_avg vp9_sad16x32_avg_c
+
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x32x4d vp9_sad16x32x4d_c
+
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x8 vp9_sad16x8_c
+
+unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x8_avg vp9_sad16x8_avg_c
+
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x8x3 vp9_sad16x8x3_c
+
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x8x4d vp9_sad16x8x4d_c
+
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x8x8 vp9_sad16x8x8_c
+
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x16 vp9_sad32x16_c
+
+unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x16_avg vp9_sad32x16_avg_c
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x16x4d vp9_sad32x16x4d_c
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x32 vp9_sad32x32_c
+
+unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x32_avg vp9_sad32x32_avg_c
+
+void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x32x3 vp9_sad32x32x3_c
+
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x32x4d vp9_sad32x32x4d_c
+
+void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad32x32x8 vp9_sad32x32x8_c
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x64 vp9_sad32x64_c
+
+unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x64_avg vp9_sad32x64_avg_c
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x64x4d vp9_sad32x64x4d_c
+
+unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x4 vp9_sad4x4_c
+
+unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x4_avg vp9_sad4x4_avg_c
+
+void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x4x3 vp9_sad4x4x3_c
+
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x4x4d vp9_sad4x4x4d_c
+
+void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x4x8 vp9_sad4x4x8_c
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x8 vp9_sad4x8_c
+
+unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x8_avg vp9_sad4x8_avg_c
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x8x4d vp9_sad4x8x4d_c
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x8x8 vp9_sad4x8x8_c
+
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x32 vp9_sad64x32_c
+
+unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x32_avg vp9_sad64x32_avg_c
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x32x4d vp9_sad64x32x4d_c
+
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x64 vp9_sad64x64_c
+
+unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x64_avg vp9_sad64x64_avg_c
+
+void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x64x3 vp9_sad64x64x3_c
+
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x64x4d vp9_sad64x64x4d_c
+
+void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad64x64x8 vp9_sad64x64x8_c
+
+unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x16 vp9_sad8x16_c
+
+unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x16_avg vp9_sad8x16_avg_c
+
+void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x16x3 vp9_sad8x16x3_c
+
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x16x4d vp9_sad8x16x4d_c
+
+void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x16x8 vp9_sad8x16x8_c
+
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x4 vp9_sad8x4_c
+
+unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x4_avg vp9_sad8x4_avg_c
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x4x4d vp9_sad8x4x4d_c
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x4x8 vp9_sad8x4x8_c
+
+unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x8 vp9_sad8x8_c
+
+unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x8_avg vp9_sad8x8_avg_c
+
+void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x8x3 vp9_sad8x8x3_c
+
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x8x4d vp9_sad8x8x4d_c
+
+void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x8x8 vp9_sad8x8x8_c
+
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
+
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
+
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
+
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
+
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
+
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
+
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
+
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
+
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
+
+unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
+
+unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
+
+unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
+
+unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
+
+unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
+
+unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
+
+unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
+
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
+
+unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
+
+unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
+
+void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vp9_subtract_block vp9_subtract_block_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c

-void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction);
-#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x16 vp9_variance16x16_c
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x32 vp9_variance16x32_c
+
+unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x8 vp9_variance16x8_c
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance32x16 vp9_variance32x16_c
+
+unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance32x32 vp9_variance32x32_c
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x64 vp9_variance32x64_c + +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x4 vp9_variance4x4_c + +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x8 vp9_variance4x8_c + +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance64x32 vp9_variance64x32_c + +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance64x64 vp9_variance64x64_c + +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x16 vp9_variance8x16_c + +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x4 vp9_variance8x4_c + +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x8 vp9_variance8x8_c void vp9_rtcd(void); + #include "vpx_config.h" #ifdef RTCD_C static void setup_rtcd_internal(void) { - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h b/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h index 4cb72a9878fb..3c1d522907f3 100644 --- a/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h +++ b/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP9 */ @@ -19,222 +23,21 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct search_site_config; +struct mv; union int_mv; struct yv12_buffer_config; -void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c - -void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c - -void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c - -void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c - -void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c - -void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c - -void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c - -void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c - -void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, 
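The hunk above shows the shape every generated vp9_rtcd header shares: on a target with no SIMD alternatives, each symbol is bound straight to its C fallback with a #define and setup_rtcd_internal() stays empty, while the x86 headers that follow declare RTCD_EXTERN function pointers and fill them in at run time. Below is a minimal, hedged sketch of that dispatch pattern (not part of the patch), reusing the vp9_idct4x4_16_add family from the next header; cpu_has_sse2() is a hypothetical stand-in for the real CPU-flag probe, which this patch does not show.

#include <stdint.h>

void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);

/* In the generated headers this is declared RTCD_EXTERN, which expands to
   `extern` everywhere except the one translation unit that defines RTCD_C
   and therefore instantiates the pointer. */
void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride);

static int cpu_has_sse2(void) { return 0; }  /* hypothetical flag probe */

static void setup_rtcd_internal(void) {
    vp9_idct4x4_16_add = vp9_idct4x4_16_add_c;        /* portable default */
    if (cpu_has_sse2())
        vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2; /* upgrade per CPU flag */
}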
diff --git a/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h b/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h
index 4cb72a9878fb..3c1d522907f3 100644
--- a/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h
+++ b/media/libvpx/vp9_rtcd_x86-darwin9-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP9
  */
@@ -19,222 +23,21 @@ struct macroblockd;
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct search_site_config;
+struct mv;
 union int_mv;
 struct yv12_buffer_config;
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_c
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_c
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -250,14 +53,268 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8
 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
+
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get8x8var vp9_get8x8var_c
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+#define vp9_get_mb_ss vp9_get_mb_ss_c
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
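The additions above all follow the same multi-ISA shape: _c, _sse2, and for some kernels _ssse3/_avx2 implementations sit behind a single RTCD_EXTERN pointer. A hedged usage sketch (not part of the patch), assuming vp9_rtcd(), declared earlier, plays its conventional role of running CPU detection once before any pointer is called; the buffer sizes and include name are illustrative only:

#include <stdint.h>

void vp9_rtcd(void);  /* one-time dispatch-table setup, from the header above */

/* The generated header declares this RTCD_EXTERN; plain extern works for a
   caller-side sketch. */
extern void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);

static void forward_transform_example(void) {
  int16_t input[8 * 8] = { 0 };   /* illustrative source block */
  int16_t output[8 * 8];
  vp9_rtcd();                     /* must run before calling any pointer */
  vp9_fdct8x8(input, output, 8);  /* dispatches to the detected variant */
}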
@@ -266,33 +323,9 @@
 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct8x8_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type);
 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
@@ -302,449 +335,428 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
 void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vp9_get_sse_sum_8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x64 vp9_sad64x64_c
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x64 vp9_sad32x64_c
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x32 vp9_sad64x32_c
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x16 vp9_sad32x16_c
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x32 vp9_sad16x32_c
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x32 vp9_sad32x32_c
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x4 vp9_sad8x4_c
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x8 vp9_sad4x8_c
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_c
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_c
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_c
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_c
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_c
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x8_avg vp9_sad16x8_avg_c - -unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x16_avg vp9_sad8x16_avg_c - -unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x8_avg vp9_sad8x8_avg_c - -unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x4_avg vp9_sad8x4_avg_c - -unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x8_avg vp9_sad4x8_avg_c - -unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x4_avg vp9_sad4x4_avg_c - -unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_c - -unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_c - -unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_c - -unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c - -unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c - -unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c - -unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c - -unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c - -unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c - -void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x3 vp9_sad64x64x3_c - -void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, 
int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x3 vp9_sad32x32x3_c - -void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad64x64x8 vp9_sad64x64x8_c - -void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad32x32x8 vp9_sad32x32x8_c - -void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x16x8 vp9_sad16x16x8_c - -void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x8x8 vp9_sad16x8x8_c - -void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x16x8 vp9_sad8x16x8_c - -void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x8x8 vp9_sad8x8x8_c - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x4x8 
vp9_sad8x4x8_c - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x8x8 vp9_sad4x8x8_c - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x4x8 vp9_sad4x4x8_c - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, 
int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + +void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void 
vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vp9_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vp9_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); + +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); + +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_vertical_8)(uint8_t *s, int 
pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + +void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c - unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x16 vp9_mse8x16_c + unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); #define vp9_mse8x8 vp9_mse8x8_c -unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c - -unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *); - -int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_c - -void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -#define vp9_subtract_block vp9_subtract_block_c - void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_b vp9_quantize_b_c void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t 
*qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c -void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type); -void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); -RTCD_EXTERN void (*vp9_short_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp vp9_quantize_fp_c -void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type); -void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); -RTCD_EXTERN void (*vp9_short_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c -void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type); -void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); -RTCD_EXTERN void (*vp9_short_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type); +int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_refining_search_sad vp9_refining_search_sad_c -void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fwht4x4 vp9_fwht4x4_c +unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x16 vp9_sad16x16_c -void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride); -RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride); +unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x16_avg vp9_sad16x16_avg_c -void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride); -RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride); +void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sad_array); -void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride); -RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride); +void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride); -RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride); +void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x16x8 vp9_sad16x16x8_c -void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride); -RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride); +unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x32 vp9_sad16x32_c -int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); +unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x32_avg vp9_sad16x32_avg_c -int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); +void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, 
struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); +unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x8 vp9_sad16x8_c -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x8_avg vp9_sad16x8_avg_c -void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction); -#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c +void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x8x8 vp9_sad16x8x8_c + +unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x16 vp9_sad32x16_c + +unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x16_avg vp9_sad32x16_avg_c + +void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int 
*sad_array); + +unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x32 vp9_sad32x32_c + +unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x32_avg vp9_sad32x32_avg_c + +void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x3 vp9_sad32x32x3_c + +void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad32x32x8 vp9_sad32x32x8_c + +unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x64 vp9_sad32x64_c + +unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x64_avg vp9_sad32x64_avg_c + +void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x4 vp9_sad4x4_c + +unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x4_avg vp9_sad4x4_avg_c + +void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x4x8 vp9_sad4x4x8_c + +unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x8 vp9_sad4x8_c + +unsigned int 
vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x8_avg vp9_sad4x8_avg_c + +void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x8x8 vp9_sad4x8x8_c + +unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x32 vp9_sad64x32_c + +unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x32_avg vp9_sad64x32_avg_c + +void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x64 vp9_sad64x64_c + +unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x64_avg vp9_sad64x64_avg_c + +void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x3 vp9_sad64x64x3_c + +void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad64x64x8 vp9_sad64x64x8_c + +unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x16 vp9_sad8x16_c + +unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x16_avg vp9_sad8x16_avg_c + +void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void 
vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x16x8 vp9_sad8x16x8_c + +unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x4 vp9_sad8x4_c + +unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x4_avg vp9_sad8x4_avg_c + +void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x4x8 vp9_sad8x4x8_c + +unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x8 vp9_sad8x8_c + +unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x8_avg vp9_sad8x8_avg_c + +void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x8x8 vp9_sad8x8x8_c + +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c + +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c + +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c + +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c + +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c + +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c + +unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c + +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c + +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c + +unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c + +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t 
*second_pred); +#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c + +unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c + +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c + +unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c + +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c + +unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c + +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c + +unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c + +unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c + +unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c + +unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance8x4 
vp9_sub_pixel_variance8x4_c + +unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c + +void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vp9_subtract_block vp9_subtract_block_c + +void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); + +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c + +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c + +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c + +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c + +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c + +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c + +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c + +unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x32 vp9_variance16x32_c + +unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x8 vp9_variance16x8_c + +unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse); +RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x64 vp9_variance32x64_c + +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x4 vp9_variance4x4_c + +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x8 vp9_variance4x8_c + +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x16 vp9_variance8x16_c + +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x4 vp9_variance8x4_c + +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x8 vp9_variance8x8_c void vp9_rtcd(void); @@ -756,367 +768,180 @@ static void setup_rtcd_internal(void) (void)flags; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2; - - vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c; - if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2; - - vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c; - if (flags & HAS_MMX) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx; - - vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2; - if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2; - - vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c; - if (flags & 

 void vp9_rtcd(void);
@@ -756,367 +768,180 @@ static void setup_rtcd_internal(void)
     (void)flags;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c;
-    if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2;
-
-    vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c;
-    if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2;
-
-    vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c;
-    if (flags & HAS_MMX) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx;
-
-    vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;
-    if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
-    if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
-
-    vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c;
-    if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2;
-
-    vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c;
-    if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx;
-
-
-
-
-
-
+    vp9_block_error = vp9_block_error_c;
+    if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
     vp9_convolve8 = vp9_convolve8_c;
     if (flags & HAS_SSE2) vp9_convolve8 = vp9_convolve8_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3;
-
-    vp9_convolve8_horiz = vp9_convolve8_horiz_c;
-    if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
-    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
-
-    vp9_convolve8_vert = vp9_convolve8_vert_c;
-    if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2;
-    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
-
+    if (flags & HAS_AVX2) vp9_convolve8 = vp9_convolve8_avx2;
     vp9_convolve8_avg = vp9_convolve8_avg_c;
     if (flags & HAS_SSE2) vp9_convolve8_avg = vp9_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3;
-
     vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_c;
     if (flags & HAS_SSE2) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3;
-
    vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_c;
     if (flags & HAS_SSE2) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3;
-
-    vp9_idct4x4_1_add = vp9_idct4x4_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2;
-
-    vp9_idct4x4_16_add = vp9_idct4x4_16_add_c;
-    if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2;
-
-    vp9_idct8x8_1_add = vp9_idct8x8_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2;
-
-    vp9_idct8x8_64_add = vp9_idct8x8_64_add_c;
-    if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
-
-    vp9_idct8x8_10_add = vp9_idct8x8_10_add_c;
-    if (flags & HAS_SSE2) vp9_idct8x8_10_add = vp9_idct8x8_10_add_sse2;
-
-    vp9_idct16x16_1_add = vp9_idct16x16_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2;
-
-    vp9_idct16x16_256_add = vp9_idct16x16_256_add_c;
-    if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2;
-
-    vp9_idct16x16_10_add = vp9_idct16x16_10_add_c;
-    if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
-
-    vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c;
-    if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2;
-
-    vp9_idct32x32_34_add = vp9_idct32x32_34_add_c;
-    if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2;
-
-    vp9_idct32x32_1_add = vp9_idct32x32_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2;
-
-    vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
-    if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
-
-    vp9_iht8x8_64_add = vp9_iht8x8_64_add_c;
-    if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
-
-    vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
-    if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2;
-
-
-
-
-
-
-
-
-
-
-    vp9_variance16x16 = vp9_variance16x16_c;
-    if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx;
-
-    vp9_variance16x8 = vp9_variance16x8_c;
-    if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx;
-
-    vp9_variance8x16 = vp9_variance8x16_c;
-    if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx;
-
-    vp9_variance8x8 = vp9_variance8x8_c;
-    if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx;
-
-    vp9_get_sse_sum_8x8 = vp9_get_sse_sum_8x8_c;
-    if (flags & HAS_SSE2) vp9_get_sse_sum_8x8 = vp9_get8x8var_sse2;
-
-
-
-    vp9_variance4x4 = vp9_variance4x4_c;
-    if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sad16x16 = vp9_sad16x16_c;
-    if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx;
-
-    vp9_sad16x8 = vp9_sad16x8_c;
-    if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx;
-
-    vp9_sad8x16 = vp9_sad8x16_c;
-    if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx;
-
-    vp9_sad8x8 = vp9_sad8x8_c;
-    if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx;
-
-
-
-    vp9_sad4x4 = vp9_sad4x4_c;
-    if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sad16x16x3 = vp9_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
-
-    vp9_sad16x8x3 = vp9_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
-
-    vp9_sad8x16x3 = vp9_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
-
-    vp9_sad8x8x3 = vp9_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
-
-    vp9_sad4x4x3 = vp9_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sad64x64x4d = vp9_sad64x64x4d_c;
-    if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
-
-    vp9_sad32x64x4d = vp9_sad32x64x4d_c;
-    if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
-
-    vp9_sad64x32x4d = vp9_sad64x32x4d_c;
-    if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2;
-
-    vp9_sad32x16x4d = vp9_sad32x16x4d_c;
-    if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2;
-
-    vp9_sad16x32x4d = vp9_sad16x32x4d_c;
-    if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
-
-    vp9_sad32x32x4d = vp9_sad32x32x4d_c;
-    if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
-
-    vp9_sad16x16x4d = vp9_sad16x16x4d_c;
-    if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2;
-
-    vp9_sad16x8x4d = vp9_sad16x8x4d_c;
-    if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2;
-
-    vp9_sad8x16x4d = vp9_sad8x16x4d_c;
-    if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2;
-
-    vp9_sad8x8x4d = vp9_sad8x8x4d_c;
-    if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2;
-
-    vp9_sad8x4x4d = vp9_sad8x4x4d_c;
-    if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
-
-    vp9_sad4x8x4d = vp9_sad4x8x4d_c;
-    if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse;
-
-    vp9_sad4x4x4d = vp9_sad4x4x4d_c;
-    if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse;
-
-    vp9_mse16x16 = vp9_mse16x16_c;
-    if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx;
-
-
-
-
-
-
-    vp9_get_mb_ss = vp9_get_mb_ss_c;
-    if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx;
-    if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2;
-
-
-
-
-
-    vp9_short_fht4x4 = vp9_short_fht4x4_c;
-    if (flags & HAS_SSE2) vp9_short_fht4x4 = vp9_short_fht4x4_sse2;
-
-    vp9_short_fht8x8 = vp9_short_fht8x8_c;
-    if (flags & HAS_SSE2) vp9_short_fht8x8 = vp9_short_fht8x8_sse2;
-
-    vp9_short_fht16x16 = vp9_short_fht16x16_c;
-    if (flags & HAS_SSE2) vp9_short_fht16x16 = vp9_short_fht16x16_sse2;
-
-
-    vp9_fdct4x4 = vp9_fdct4x4_c;
-    if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2;
-
-    vp9_fdct8x8 = vp9_fdct8x8_c;
-    if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2;
-
+    vp9_convolve8_horiz = vp9_convolve8_horiz_c;
+    if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
+    if (flags & HAS_AVX2) vp9_convolve8_horiz = vp9_convolve8_horiz_avx2;
+    vp9_convolve8_vert = vp9_convolve8_vert_c;
+    if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
+    if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2;
     vp9_fdct16x16 = vp9_fdct16x16_c;
     if (flags & HAS_SSE2) vp9_fdct16x16 = vp9_fdct16x16_sse2;
-
+    vp9_fdct16x16_1 = vp9_fdct16x16_1_c;
+    if (flags & HAS_SSE2) vp9_fdct16x16_1 = vp9_fdct16x16_1_sse2;
     vp9_fdct32x32 = vp9_fdct32x32_c;
     if (flags & HAS_SSE2) vp9_fdct32x32 = vp9_fdct32x32_sse2;
-
+    if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2;
+    vp9_fdct32x32_1 = vp9_fdct32x32_1_c;
+    if (flags & HAS_SSE2) vp9_fdct32x32_1 = vp9_fdct32x32_1_sse2;
     vp9_fdct32x32_rd = vp9_fdct32x32_rd_c;
     if (flags & HAS_SSE2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2;
-
+    if (flags & HAS_AVX2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_avx2;
+    vp9_fdct4x4 = vp9_fdct4x4_c;
+    if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2;
+    vp9_fdct4x4_1 = vp9_fdct4x4_1_c;
+    if (flags & HAS_SSE2) vp9_fdct4x4_1 = vp9_fdct4x4_1_sse2;
+    vp9_fdct8x8 = vp9_fdct8x8_c;
+    if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2;
+    vp9_fdct8x8_1 = vp9_fdct8x8_1_c;
+    if (flags & HAS_SSE2) vp9_fdct8x8_1 = vp9_fdct8x8_1_sse2;
+    vp9_fht16x16 = vp9_fht16x16_c;
+    if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
+    vp9_fht4x4 = vp9_fht4x4_c;
+    if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2;
+    vp9_fht8x8 = vp9_fht8x8_c;
+    if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
-
-    vp9_refining_search_sad = vp9_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4;
-
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4;
-
+    vp9_get16x16var = vp9_get16x16var_c;
+    if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2;
+    vp9_idct16x16_10_add = vp9_idct16x16_10_add_c;
+    if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3;
+    vp9_idct16x16_1_add = vp9_idct16x16_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2;
+    vp9_idct16x16_256_add = vp9_idct16x16_256_add_c;
+    if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3;
+    vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c;
+    if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2;
+    vp9_idct32x32_1_add = vp9_idct32x32_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2;
+    vp9_idct32x32_34_add = vp9_idct32x32_34_add_c;
+    if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2;
+    vp9_idct4x4_16_add = vp9_idct4x4_16_add_c;
+    if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2;
+    vp9_idct4x4_1_add = vp9_idct4x4_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2;
+    vp9_idct8x8_12_add = vp9_idct8x8_12_add_c;
+    if (flags & HAS_SSE2) vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2;
+    vp9_idct8x8_1_add = vp9_idct8x8_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2;
+    vp9_idct8x8_64_add = vp9_idct8x8_64_add_c;
+    if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
+    vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
+    if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2;
+    vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
+    if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
+    vp9_iht8x8_64_add = vp9_iht8x8_64_add_c;
+    if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
+    vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2;
+    if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2;
+    vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_c;
+    if (flags & HAS_MMX) vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_mmx;
+    vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_sse2;
+    vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_sse2;
+    vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_sse2;
+    vp9_lpf_vertical_16 = vp9_lpf_vertical_16_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_16 = vp9_lpf_vertical_16_sse2;
+    vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_sse2;
+    vp9_lpf_vertical_4 = vp9_lpf_vertical_4_c;
+    if (flags & HAS_MMX) vp9_lpf_vertical_4 = vp9_lpf_vertical_4_mmx;
+    vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_sse2;
+    vp9_lpf_vertical_8 = vp9_lpf_vertical_8_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2;
+    vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2;
+    vp9_mse16x16 = vp9_mse16x16_c;
+    if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2;
+    vp9_sad16x16x3 = vp9_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
+    vp9_sad16x16x4d = vp9_sad16x16x4d_c;
+    if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2;
+    vp9_sad16x32x4d = vp9_sad16x32x4d_c;
+    if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
+    vp9_sad16x8x3 = vp9_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
+    vp9_sad16x8x4d = vp9_sad16x8x4d_c;
+    if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2;
+    vp9_sad32x16x4d = vp9_sad32x16x4d_c;
+    if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2;
+    vp9_sad32x32x4d = vp9_sad32x32x4d_c;
+    if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
+    if (flags & HAS_AVX2) vp9_sad32x32x4d = vp9_sad32x32x4d_avx2;
+    vp9_sad32x64x4d = vp9_sad32x64x4d_c;
+    if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
+    vp9_sad4x4x3 = vp9_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
+    vp9_sad4x4x4d = vp9_sad4x4x4d_c;
+    if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse;
+    vp9_sad4x8x4d = vp9_sad4x8x4d_c;
+    if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse;
+    vp9_sad64x32x4d = vp9_sad64x32x4d_c;
+    if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2;
+    vp9_sad64x64x4d = vp9_sad64x64x4d_c;
+    if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
+    if (flags & HAS_AVX2) vp9_sad64x64x4d = vp9_sad64x64x4d_avx2;
+    vp9_sad8x16x3 = vp9_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
+    vp9_sad8x16x4d = vp9_sad8x16x4d_c;
+    if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2;
+    vp9_sad8x4x4d = vp9_sad8x4x4d_c;
+    if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
+    vp9_sad8x8x3 = vp9_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
+    vp9_sad8x8x4d = vp9_sad8x8x4d_c;
+    if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2;
+    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_c;
+    if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_avx2;
+    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_c;
+    if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_avx2;
+    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c;
+    if (flags & HAS_AVX2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_avx2;
+    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c;
+    if (flags & HAS_AVX2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_avx2;
     vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
     if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+    vp9_variance16x16 = vp9_variance16x16_c;
+    if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2;
+    vp9_variance32x16 = vp9_variance32x16_c;
+    if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2;
+    vp9_variance32x32 = vp9_variance32x32_c;
+    if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2;
+    vp9_variance64x32 = vp9_variance64x32_c;
+    if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2;
+    vp9_variance64x64 = vp9_variance64x64_c;
+    if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
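Continuing the aside: the assignments in setup_rtcd_internal() above are emitted in C -> SSE2 -> SSSE3 -> AVX2 order, so the last check whose CPU flag is set wins. Callers never pick a variant themselves; they call vp9_rtcd() once (it is typically guarded so the setup runs only once per process) and then use the dispatched names directly. A hypothetical caller, assuming nothing beyond the vp9_rtcd() entry point declared above:

/* usage_sketch.c -- illustrative only */
void vp9_rtcd(void);  /* declared in the generated header above */

void codec_global_init(void) {
    /* Must run before any dispatched vp9_* symbol is used; afterwards
     * vp9_convolve8, vp9_variance16x16, ... already point at the best
     * implementation the running CPU supports. */
    vp9_rtcd();
}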
diff --git a/media/libvpx/vp9_rtcd_x86-linux-gcc.h b/media/libvpx/vp9_rtcd_x86-linux-gcc.h
index 39592a241043..3c1d522907f3 100644
--- a/media/libvpx/vp9_rtcd_x86-linux-gcc.h
+++ b/media/libvpx/vp9_rtcd_x86-linux-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP9
  */
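The hunk above is the point of this header change for C++ embedders such as Gecko: the generated file's declarations now sit inside an extern "C" block, so a C++ translation unit can include it and still link against the C definitions without name mangling. It follows the usual header idiom (sketch below; my_c_function is a made-up name):

/* guard_sketch.h -- illustrative only */
#ifdef __cplusplus
extern "C" {
#endif

void my_c_function(void);  /* linked by its unmangled C name */

#ifdef __cplusplus
}  /* extern "C" */
#endif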
@@ -19,221 +23,21 @@ struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct search_site_config;
+struct mv;
 union int_mv;
 struct yv12_buffer_config;
 
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_c
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_c
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
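For orientation, the vp9_convolve8_* family above performs sub-pixel motion-compensated interpolation with an 8-tap filter; the _horiz/_vert forms filter one dimension, vp9_convolve8 both, and the _avg forms average into dst. A scalar sketch of the core operation (my own simplification: a single fixed filter phase and 7-bit taps that sum to 128, not the exact libvpx kernel):

/* convolve_sketch.c -- illustrative only */
#include <stdint.h>

static uint8_t clip_pixel(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

/* One output row: 8 taps centered 3 pixels to the left of x. */
static void convolve8_horiz_row(const uint8_t *src, uint8_t *dst, int w,
                                const int16_t filter[8]) {
    for (int x = 0; x < w; ++x) {
        int sum = 0;
        for (int k = 0; k < 8; ++k)
            sum += src[x + k - 3] * filter[k];
        dst[x] = clip_pixel((sum + 64) >> 7);  /* round and rescale by 128 */
    }
}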
-RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -249,14 +53,268 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8
 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_c
+
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_c
+
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_16x16 vp9_d153_predictor_16x16_c
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_4x4 vp9_d153_predictor_4x4_c
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_8x8 vp9_d153_predictor_8x8_c
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_16x16 vp9_d207_predictor_16x16_c
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_32x32 vp9_d207_predictor_32x32_c
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_4x4 vp9_d207_predictor_4x4_c
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d207_predictor_8x8 vp9_d207_predictor_8x8_c
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_16x16 vp9_d45_predictor_16x16_c
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_32x32 vp9_d45_predictor_32x32_c
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_4x4 vp9_d45_predictor_4x4_c
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d45_predictor_8x8 vp9_d45_predictor_8x8_c
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_16x16 vp9_d63_predictor_16x16_c
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_32x32 vp9_d63_predictor_32x32_c
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_4x4 vp9_d63_predictor_4x4_c
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d63_predictor_8x8 vp9_d63_predictor_8x8_c
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_c
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_c
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_c
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_c
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
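The block of intra predictors re-declared above covers the VP9 modes per block size: V and H copy the top row or left column, DC averages the borders, TM extrapolates from the top-left corner, and the d45/d63/d117/d135/d153/d207 variants project along the named angles. The simplest case, sketched under the same dst/above/left convention as the prototypes (bs, the block size, is my own parameter; sketch only):

/* predictor_sketch.c -- illustrative only */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* V (vertical) prediction: replicate the reconstructed row above the
 * block into every row of the block. */
static void v_predictor(uint8_t *dst, ptrdiff_t y_stride, int bs,
                        const uint8_t *above) {
    for (int r = 0; r < bs; ++r)
        memcpy(dst + r * y_stride, above, (size_t)bs);
}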
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8_1)(const int16_t *input, int16_t *output, int stride);
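New in this update are the *_1 forward-transform entry points alongside each full vp9_fdct*. My understanding -- treat the details as an assumption -- is that they compute only the DC coefficient, a scaled sum of the residual block, giving the encoder a cheap path when the remaining coefficients are not needed. Sketch, with the per-size scale factor deliberately left symbolic:

/* fdct_dc_sketch.c -- illustrative only */
#include <stdint.h>

/* DC-only forward transform: output[0] gets a scaled block sum; the real
 * per-size scaling lives in vp9/encoder/vp9_dct.c. */
static void fdct_dc_only(const int16_t *input, int16_t *output, int stride,
                         int size, int scale_shift) {
    int32_t sum = 0;
    for (int r = 0; r < size; ++r)
        for (int c = 0; c < size; ++c)
            sum += input[r * stride + c];
    output[0] = (int16_t)(sum >> scale_shift);  /* placeholder scaling */
}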
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get8x8var vp9_get8x8var_c
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+#define vp9_get_mb_ss vp9_get_mb_ss_c
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_16x16 vp9_h_predictor_16x16_c
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_4x4 vp9_h_predictor_4x4_c
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_h_predictor_8x8 vp9_h_predictor_8x8_c
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
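A note on the naming above: the numeric suffix on each inverse transform (vp9_idct16x16_{1,10,256}_add, vp9_idct32x32_{1,34,1024}_add, and the 8x8 shortcut renamed from _10 to _12 in this update) is the number of nonzero coefficients that entry point assumes, so callers can pick a shortcut from the block's end-of-block (eob) value. A hypothetical dispatcher; the prototypes are restated as plain functions here purely for the sketch:

/* idct_dispatch_sketch.c -- illustrative only */
#include <stdint.h>

void vp9_idct16x16_1_add(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct16x16_10_add(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct16x16_256_add(const int16_t *input, uint8_t *dest, int dest_stride);

static void idct16x16_add_by_eob(const int16_t *in, uint8_t *dst,
                                 int stride, int eob) {
    if (eob == 1)
        vp9_idct16x16_1_add(in, dst, stride);    /* DC only */
    else if (eob <= 10)
        vp9_idct16x16_10_add(in, dst, stride);   /* sparse coefficients */
    else
        vp9_idct16x16_256_add(in, dst, stride);  /* full transform */
}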
@@ -265,33 +323,9 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct8x8_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type);
 
 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
@@ -301,449 +335,428 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
 
 void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
 
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_c
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_c
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_c
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_c
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_c
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_c
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vp9_get_sse_sum_8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_c
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_c
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x64 vp9_sub_pixel_variance64x64_c
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x64 vp9_sub_pixel_avg_variance64x64_c
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance32x32 vp9_sub_pixel_variance32x32_c
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance32x32 vp9_sub_pixel_avg_variance32x32_c
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
-
vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c - -unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c - -unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c - -unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c - -unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c - -unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c - -unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c - -unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c - -unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c - -unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c - -unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x64 vp9_sad64x64_c - -unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x64 vp9_sad32x64_c - -unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x32 vp9_sad64x32_c - -unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x16 vp9_sad32x16_c - -unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x32 vp9_sad16x32_c - -unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
max_sad); -#define vp9_sad32x32 vp9_sad32x32_c - -unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); - -unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); - -unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); - -unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); - -unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x4 vp9_sad8x4_c - -unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x8 vp9_sad4x8_c - -unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); - -unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x64_avg vp9_sad64x64_avg_c - -unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x64_avg vp9_sad32x64_avg_c - -unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x32_avg vp9_sad64x32_avg_c - -unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x16_avg vp9_sad32x16_avg_c - -unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x32_avg vp9_sad16x32_avg_c - 
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_c
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_c
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_c
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_c
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_c
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_c
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_c
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_c
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_c
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_c
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_c
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 
 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 #define vp9_mse16x8 vp9_mse16x8_c
 
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_c
+
 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 #define vp9_mse8x8 vp9_mse8x8_c
 
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_c
-
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b vp9_quantize_b_c
 
 void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
 
-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
 
-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_refining_search_sad vp9_refining_search_sad_c
 
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x16 vp9_sad16x16_c
 
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x16_avg vp9_sad16x16_avg_c
 
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c
 
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x32 vp9_sad16x32_c
 
-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x32_avg vp9_sad16x32_avg_c
 
-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
 
-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x8 vp9_sad16x8_c
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x8_avg vp9_sad16x8_avg_c
 
-void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction);
-#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x8x8 vp9_sad16x8x8_c
+
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x16 vp9_sad32x16_c
+
+unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x16_avg vp9_sad32x16_avg_c
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x32 vp9_sad32x32_c
+
+unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x32_avg vp9_sad32x32_avg_c
+
+void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x32x3 vp9_sad32x32x3_c
+
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad32x32x8 vp9_sad32x32x8_c
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x64 vp9_sad32x64_c
+
+unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x64_avg vp9_sad32x64_avg_c
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x4 vp9_sad4x4_c
+
+unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x4_avg vp9_sad4x4_avg_c
+
+void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x4x8 vp9_sad4x4x8_c
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x8 vp9_sad4x8_c
+
+unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x8_avg vp9_sad4x8_avg_c
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x8x8 vp9_sad4x8x8_c
+
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x32 vp9_sad64x32_c
+
+unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x32_avg vp9_sad64x32_avg_c
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x64 vp9_sad64x64_c
+
+unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x64_avg vp9_sad64x64_avg_c
+
+void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x64x3 vp9_sad64x64x3_c
+
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad64x64x8 vp9_sad64x64x8_c
+
+unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x16 vp9_sad8x16_c
+
+unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x16_avg vp9_sad8x16_avg_c
+
+void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x16x8 vp9_sad8x16x8_c
+
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x4 vp9_sad8x4_c
+
+unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x4_avg vp9_sad8x4_avg_c
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x4x8 vp9_sad8x4x8_c
+
+unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x8 vp9_sad8x8_c
+
+unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x8_avg vp9_sad8x8_avg_c
+
+void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x8x8 vp9_sad8x8x8_c
+
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance16x16 vp9_sub_pixel_avg_variance16x16_c
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance16x32 vp9_sub_pixel_avg_variance16x32_c
+
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance16x8 vp9_sub_pixel_avg_variance16x8_c
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance32x16 vp9_sub_pixel_avg_variance32x16_c
+
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance32x64 vp9_sub_pixel_avg_variance32x64_c
+
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance4x4 vp9_sub_pixel_avg_variance4x4_c
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance4x8 vp9_sub_pixel_avg_variance4x8_c
+
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance64x32 vp9_sub_pixel_avg_variance64x32_c
+
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance8x16 vp9_sub_pixel_avg_variance8x16_c
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance8x4 vp9_sub_pixel_avg_variance8x4_c
+
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+#define vp9_sub_pixel_avg_variance8x8 vp9_sub_pixel_avg_variance8x8_c
+
+unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance16x16 vp9_sub_pixel_variance16x16_c
+
+unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance16x32 vp9_sub_pixel_variance16x32_c
+
+unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance16x8 vp9_sub_pixel_variance16x8_c
+
+unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance32x16 vp9_sub_pixel_variance32x16_c
+
+unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance32x64 vp9_sub_pixel_variance32x64_c
+
+unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance4x4 vp9_sub_pixel_variance4x4_c
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance4x8 vp9_sub_pixel_variance4x8_c
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance64x32 vp9_sub_pixel_variance64x32_c
+
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance8x16 vp9_sub_pixel_variance8x16_c
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance8x4 vp9_sub_pixel_variance8x4_c
+
+unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_sub_pixel_variance8x8 vp9_sub_pixel_variance8x8_c
+
+void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vp9_subtract_block vp9_subtract_block_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_c
+
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
+
+void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_c
+
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_c
+
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_c
+
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_c
+
+void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_c
+
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_c
+
+unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x32 vp9_variance16x32_c
+
+unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x8 vp9_variance16x8_c
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance32x64 vp9_variance32x64_c
+
+unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x4 vp9_variance4x4_c
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x8 vp9_variance4x8_c
+
+unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x16 vp9_variance8x16_c
+
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x4 vp9_variance8x4_c
+
+unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x8 vp9_variance8x8_c
 
 void vp9_rtcd(void);
 
@@ -755,366 +768,180 @@ static void setup_rtcd_internal(void)
     (void)flags;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c;
-    if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2;
-
-    vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c;
-    if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2;
-
-    vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c;
-    if (flags & HAS_MMX)
vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx; - - vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2; - - vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c; - if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2; - - vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c; - if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx; - - - - - - + vp9_block_error = vp9_block_error_c; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_convolve8 = vp9_convolve8_c; if (flags & HAS_SSE2) vp9_convolve8 = vp9_convolve8_sse2; if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3; - - vp9_convolve8_horiz = vp9_convolve8_horiz_c; - if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; - - vp9_convolve8_vert = vp9_convolve8_vert_c; - if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; - + if (flags & HAS_AVX2) vp9_convolve8 = vp9_convolve8_avx2; vp9_convolve8_avg = vp9_convolve8_avg_c; if (flags & HAS_SSE2) vp9_convolve8_avg = vp9_convolve8_avg_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3; - vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_c; if (flags & HAS_SSE2) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3; - vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_c; if (flags & HAS_SSE2) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3; - - vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; - if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2; - - vp9_idct4x4_16_add = vp9_idct4x4_16_add_c; - if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2; - - vp9_idct8x8_1_add = vp9_idct8x8_1_add_c; - if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2; - - vp9_idct8x8_64_add = vp9_idct8x8_64_add_c; - if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2; - - vp9_idct8x8_10_add = vp9_idct8x8_10_add_c; - if (flags & HAS_SSE2) vp9_idct8x8_10_add = vp9_idct8x8_10_add_sse2; - - vp9_idct16x16_1_add = vp9_idct16x16_1_add_c; - if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2; - - vp9_idct16x16_256_add = vp9_idct16x16_256_add_c; - if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2; - - vp9_idct16x16_10_add = vp9_idct16x16_10_add_c; - if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; - - vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c; - if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2; - - vp9_idct32x32_34_add = vp9_idct32x32_34_add_c; - if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2; - - vp9_idct32x32_1_add = vp9_idct32x32_1_add_c; - if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2; - - vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; - if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; - - vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; - if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; - - vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; - if (flags & HAS_SSE2) vp9_iht16x16_256_add = 
vp9_iht16x16_256_add_sse2; - - - - - - - - - - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx; - - vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx; - - vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx; - - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx; - - vp9_get_sse_sum_8x8 = vp9_get_sse_sum_8x8_c; - if (flags & HAS_SSE2) vp9_get_sse_sum_8x8 = vp9_get8x8var_sse2; - - - - vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_sad16x16 = vp9_sad16x16_c; - if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx; - - vp9_sad16x8 = vp9_sad16x8_c; - if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx; - - vp9_sad8x16 = vp9_sad8x16_c; - if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx; - - vp9_sad8x8 = vp9_sad8x8_c; - if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx; - - - - vp9_sad4x4 = vp9_sad4x4_c; - if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx; - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_sad16x16x3 = vp9_sad16x16x3_c; - if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; - - vp9_sad16x8x3 = vp9_sad16x8x3_c; - if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; - - vp9_sad8x16x3 = vp9_sad8x16x3_c; - if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; - - vp9_sad8x8x3 = vp9_sad8x8x3_c; - if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; - - vp9_sad4x4x3 = vp9_sad4x4x3_c; - if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; - - - - - - - - - - - vp9_sad64x64x4d = vp9_sad64x64x4d_c; - if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; - - vp9_sad32x64x4d = vp9_sad32x64x4d_c; - if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; - - vp9_sad64x32x4d = vp9_sad64x32x4d_c; - if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2; - - vp9_sad32x16x4d = vp9_sad32x16x4d_c; - if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2; - - vp9_sad16x32x4d = vp9_sad16x32x4d_c; - if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; - - vp9_sad32x32x4d = vp9_sad32x32x4d_c; - if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2; - - vp9_sad16x16x4d = vp9_sad16x16x4d_c; - if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2; - - vp9_sad16x8x4d = vp9_sad16x8x4d_c; - if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2; - - vp9_sad8x16x4d = vp9_sad8x16x4d_c; - if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2; - - vp9_sad8x8x4d = vp9_sad8x8x4d_c; - if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2; - - vp9_sad8x4x4d = vp9_sad8x4x4d_c; - if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; - - vp9_sad4x8x4d = vp9_sad4x8x4d_c; - if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse; - - vp9_sad4x4x4d = vp9_sad4x4x4d_c; - if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse; - - vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx; - - - - - - - vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; - - - - - - vp9_short_fht4x4 = vp9_short_fht4x4_c; - if (flags & HAS_SSE2) vp9_short_fht4x4 = 
vp9_short_fht4x4_sse2; - - vp9_short_fht8x8 = vp9_short_fht8x8_c; - if (flags & HAS_SSE2) vp9_short_fht8x8 = vp9_short_fht8x8_sse2; - - vp9_short_fht16x16 = vp9_short_fht16x16_c; - if (flags & HAS_SSE2) vp9_short_fht16x16 = vp9_short_fht16x16_sse2; - - - vp9_fdct4x4 = vp9_fdct4x4_c; - if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2; - - vp9_fdct8x8 = vp9_fdct8x8_c; - if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2; - + vp9_convolve8_horiz = vp9_convolve8_horiz_c; + if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vp9_convolve8_horiz = vp9_convolve8_horiz_avx2; + vp9_convolve8_vert = vp9_convolve8_vert_c; + if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2; vp9_fdct16x16 = vp9_fdct16x16_c; if (flags & HAS_SSE2) vp9_fdct16x16 = vp9_fdct16x16_sse2; - + vp9_fdct16x16_1 = vp9_fdct16x16_1_c; + if (flags & HAS_SSE2) vp9_fdct16x16_1 = vp9_fdct16x16_1_sse2; vp9_fdct32x32 = vp9_fdct32x32_c; if (flags & HAS_SSE2) vp9_fdct32x32 = vp9_fdct32x32_sse2; - + if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2; + vp9_fdct32x32_1 = vp9_fdct32x32_1_c; + if (flags & HAS_SSE2) vp9_fdct32x32_1 = vp9_fdct32x32_1_sse2; vp9_fdct32x32_rd = vp9_fdct32x32_rd_c; if (flags & HAS_SSE2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2; - + if (flags & HAS_AVX2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_avx2; + vp9_fdct4x4 = vp9_fdct4x4_c; + if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2; + vp9_fdct4x4_1 = vp9_fdct4x4_1_c; + if (flags & HAS_SSE2) vp9_fdct4x4_1 = vp9_fdct4x4_1_sse2; + vp9_fdct8x8 = vp9_fdct8x8_c; + if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2; + vp9_fdct8x8_1 = vp9_fdct8x8_1_c; + if (flags & HAS_SSE2) vp9_fdct8x8_1 = vp9_fdct8x8_1_sse2; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2; vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - - vp9_refining_search_sad = vp9_refining_search_sad_c; - if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4; - - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4; - + vp9_get16x16var = vp9_get16x16var_c; + if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; + vp9_idct16x16_10_add = vp9_idct16x16_10_add_c; + if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3; + vp9_idct16x16_1_add = vp9_idct16x16_1_add_c; + if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2; + vp9_idct16x16_256_add = vp9_idct16x16_256_add_c; + if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3; + vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2; + vp9_idct32x32_1_add = vp9_idct32x32_1_add_c; + if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2; + vp9_idct32x32_34_add = 
vp9_idct32x32_34_add_c; + if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2; + vp9_idct4x4_16_add = vp9_idct4x4_16_add_c; + if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2; + vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; + if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2; + vp9_idct8x8_12_add = vp9_idct8x8_12_add_c; + if (flags & HAS_SSE2) vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2; + vp9_idct8x8_1_add = vp9_idct8x8_1_add_c; + if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2; + vp9_idct8x8_64_add = vp9_idct8x8_64_add_c; + if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; + vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2; + vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_c; + if (flags & HAS_MMX) vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_mmx; + vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_sse2; + vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_sse2; + vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_sse2; + vp9_lpf_vertical_16 = vp9_lpf_vertical_16_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_16 = vp9_lpf_vertical_16_sse2; + vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_sse2; + vp9_lpf_vertical_4 = vp9_lpf_vertical_4_c; + if (flags & HAS_MMX) vp9_lpf_vertical_4 = vp9_lpf_vertical_4_mmx; + vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_sse2; + vp9_lpf_vertical_8 = vp9_lpf_vertical_8_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2; + vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2; + vp9_mse16x16 = vp9_mse16x16_c; + if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; + vp9_sad16x16x3 = vp9_sad16x16x3_c; + if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; + vp9_sad16x16x4d = vp9_sad16x16x4d_c; + if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2; + vp9_sad16x32x4d = vp9_sad16x32x4d_c; + if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; + vp9_sad16x8x3 = vp9_sad16x8x3_c; + if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; + vp9_sad16x8x4d = vp9_sad16x8x4d_c; + if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2; + vp9_sad32x16x4d = vp9_sad32x16x4d_c; + if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2; + vp9_sad32x32x4d = vp9_sad32x32x4d_c; + if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vp9_sad32x32x4d = vp9_sad32x32x4d_avx2; + vp9_sad32x64x4d = vp9_sad32x64x4d_c; + if (flags & HAS_SSE2) 
vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; + vp9_sad4x4x3 = vp9_sad4x4x3_c; + if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; + vp9_sad4x4x4d = vp9_sad4x4x4d_c; + if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse; + vp9_sad4x8x4d = vp9_sad4x8x4d_c; + if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse; + vp9_sad64x32x4d = vp9_sad64x32x4d_c; + if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2; + vp9_sad64x64x4d = vp9_sad64x64x4d_c; + if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vp9_sad64x64x4d = vp9_sad64x64x4d_avx2; + vp9_sad8x16x3 = vp9_sad8x16x3_c; + if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; + vp9_sad8x16x4d = vp9_sad8x16x4d_c; + if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2; + vp9_sad8x4x4d = vp9_sad8x4x4d_c; + if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; + vp9_sad8x8x3 = vp9_sad8x8x3_c; + if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; + vp9_sad8x8x4d = vp9_sad8x8x4d_c; + if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2; + vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_c; + if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_avx2; + vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_c; + if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_avx2; + vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c; + if (flags & HAS_AVX2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_avx2; + vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c; + if (flags & HAS_AVX2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_avx2; vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; + vp9_variance16x16 = vp9_variance16x16_c; + if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; + vp9_variance32x16 = vp9_variance32x16_c; + if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; + vp9_variance32x32 = vp9_variance32x32_c; + if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; + vp9_variance64x32 = vp9_variance64x32_c; + if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; + vp9_variance64x64 = vp9_variance64x64_c; + if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp9_rtcd_x86-win32-gcc.h b/media/libvpx/vp9_rtcd_x86-win32-gcc.h index ba86228db6a2..347990384d88 100644 --- a/media/libvpx/vp9_rtcd_x86-win32-gcc.h +++ b/media/libvpx/vp9_rtcd_x86-win32-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP9 */ @@ -19,253 +23,22 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct search_site_config; +struct mv; union int_mv; struct yv12_buffer_config; -void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, 
const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c - -void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c - -void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c - -void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c - -void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c - -void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void 
vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c - -void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c - -void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c - -void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c - -void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c - -void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const 
uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c - -void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c - -void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c - -void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c - -void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c - -void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, 
const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c - -void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c - -void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c - -void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c - -void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c - -void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c - -void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c - -void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); - -void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void 
vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - -void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); 
-RTCD_EXTERN void (*vp9_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - -void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_avg_sse2(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -281,14 +54,297 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t 
*filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); + +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c + +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c + +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const 
uint8_t *left); +void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t 
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *);
+RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
@@ -297,33 +353,9 @@
 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct8x8_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
-
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type);
 
 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
@@ -333,485 +365,73 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 RTCD_EXTERN void (*vp9_iht8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
 
 void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vp9_get_sse_sum_8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance_halfpixvar16x16_h)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance_halfpixvar16x16_v)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance_halfpixvar16x16_hv)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
vp9_sad8x8x8_c - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x4x8 vp9_sad8x4x8_c - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x8x8 vp9_sad4x8x8_c - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x4x8 vp9_sad4x4x8_c - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); - -void 
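/* [Editor's note, illustrative only, not part of the patch] The *_x3,
 * *_x8 and *_x4d entry points batch several SAD evaluations per call;
 * the x4d form scores one source block against four independent
 * candidate references, which lets SIMD versions share source loads.
 * A minimal C sketch of the 16x16 x4d contract, in terms of a plain
 * scalar SAD helper defined here for the example: */
#include <stdint.h>
#include <stdlib.h>
static unsigned int sad16x16_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; ++r, src += src_stride, ref += ref_stride)
    for (c = 0; c < 16; ++c)
      sad += (unsigned int)abs(src[c] - ref[c]);
  return sad;
}
static void sad16x16x4d_sketch(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *const ref_ptr[], int ref_stride,
                               unsigned int *sad_array) {
  int i;
  for (i = 0; i < 4; ++i)  /* one SAD per candidate reference block */
    sad_array[i] = sad16x16_scalar(src_ptr, src_stride, ref_ptr[i], ref_stride);
}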
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);

 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);

-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);

 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
-
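/* [Editor's note, illustrative only, not part of the patch] Unlike the
 * variance kernels, the vp9_mseWxH functions above do not subtract the
 * mean: they return the raw sum of squared differences and also store
 * it through the *sse out-parameter.  A 16x16 sketch of that contract: */
#include <stdint.h>
static unsigned int mse16x16_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  uint32_t sq = 0;
  int r, c;
  for (r = 0; r < 16; ++r, src += src_stride, ref += ref_stride)
    for (c = 0; c < 16; ++c) {
      const int d = src[c] - ref[c];
      sq += (uint32_t)(d * d);  /* accumulate squared error only */
    }
  *sse = sq;
  return sq;
}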
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-RTCD_EXTERN int64_t (*vp9_block_error)(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-RTCD_EXTERN void (*vp9_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);

 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b vp9_quantize_b_c
@@ -819,60 +439,457 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_bloc
 void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c

-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c

-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c

-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_refining_search_sad vp9_refining_search_sad_c

-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);

-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad16x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);

-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);

-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);

-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c

-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);

-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad16x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);

-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);

-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
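/* [Editor's note, illustrative only, not part of the patch] Every
 * RTCD_EXTERN symbol in this header is a function pointer filled in
 * once at startup from detected CPU features, falling back to the _c
 * reference.  A sketch of that selection pattern for vp9_sad16x8,
 * assuming this header plus the x86_simd_caps()/HAS_SSE2 helpers from
 * vpx_ports/x86.h are in scope: */
static void rtcd_select_sketch(void) {
  const int flags = x86_simd_caps();   /* query CPUID-derived feature bits */
  vp9_sad16x8 = vp9_sad16x8_c;         /* portable baseline */
  if (flags & HAS_SSE2)
    vp9_sad16x8 = vp9_sad16x8_sse2;    /* upgrade when the CPU allows it */
}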
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad16x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);

-void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction);
-#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x8x8 vp9_sad16x8x8_c
+
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad32x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad32x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x32x3 vp9_sad32x32x3_c
+
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad32x32x8 vp9_sad32x32x8_c
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad32x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
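/* [Editor's note, illustrative only, not part of the patch] The _avg
 * entry points score a compound prediction: each reference pixel is
 * first averaged with the matching second_pred pixel (rounding up)
 * before the usual SAD is taken.  A 4x4 sketch, assuming second_pred
 * is a contiguous block whose stride equals the block width: */
#include <stdint.h>
#include <stdlib.h>
static unsigned int sad4x4_avg_sketch(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      const uint8_t *second_pred) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 4; ++r, src += src_stride, ref += ref_stride,
                          second_pred += 4) {
    for (c = 0; c < 4; ++c) {
      const int pred = (ref[c] + second_pred[c] + 1) >> 1;  /* compound avg */
      sad += (unsigned int)abs(src[c] - pred);
    }
  }
  return sad;
}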
+unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad4x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x4x8 vp9_sad4x4x8_c
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad4x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x8x8 vp9_sad4x8x8_c
+
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad64x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad64x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x64x3 vp9_sad64x64x3_c
+
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad64x64x8 vp9_sad64x64x8_c
+
+unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad8x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x16x8 vp9_sad8x16x8_c
+
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad8x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x4x8 vp9_sad8x4x8_c
+
+unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+
+unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad8x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+
+void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x8x8 vp9_sad8x8x8_c
+
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
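/* [Editor's note, illustrative only, not part of the patch] All the
 * variance-family kernels share one core: accumulate the pixel
 * difference sum and the sum of squares, report the sum of squares
 * through *sse, and return sse - sum*sum/(w*h).  An 8x8 sketch of that
 * core, matching the shape of the declared signatures: */
#include <stdint.h>
static unsigned int variance8x8_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       unsigned int *sse) {
  int sum = 0;
  uint32_t sq = 0;
  int r, c;
  for (r = 0; r < 8; ++r, src += src_stride, ref += ref_stride)
    for (c = 0; c < 8; ++c) {
      const int d = src[c] - ref[c];
      sum += d;
      sq += (uint32_t)(d * d);
    }
  *sse = sq;
  /* subtract the squared mean contribution; 64 pixels in an 8x8 block */
  return sq - (unsigned int)(((int64_t)sum * sum) >> 6);
}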
+unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t 
*second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const 
uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int 
vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int 
(*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); 
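The group here shows the run-time CPU detection (RTCD) pattern this header repeats for every vp9_* entry point: one symbol per ISA level (_c, _sse2, ...) plus an RTCD_EXTERN function pointer, declared just below, that setup_rtcd_internal() re-points once the CPU's capabilities are known. A minimal, self-contained sketch of the same idea follows; my_fn, kHasSSE2, cpu_flags and setup_dispatch are invented names for illustration, not libvpx API.

#include <stdio.h>

#define kHasSSE2 0x01                   /* hypothetical capability bit */

static int my_fn_c(int x)    { return 2 * x; }  /* portable fallback */
static int my_fn_sse2(int x) { return 2 * x; }  /* stands in for a SIMD variant */

static int (*my_fn)(int);               /* the RTCD_EXTERN-style pointer */

static void setup_dispatch(int cpu_flags) {
    my_fn = my_fn_c;                    /* always start at the C version */
    if (cpu_flags & kHasSSE2)
        my_fn = my_fn_sse2;             /* upgrade when the CPU allows it */
}

int main(void) {
    setup_dispatch(kHasSSE2);
    printf("%d\n", my_fn(21));          /* dispatches through the pointer: 42 */
    return 0;
}

Callers invoke my_fn() without knowing which variant was installed, so the per-CPU choice is made once at startup rather than on every call — exactly what the RTCD_EXTERN pointers here do for each vp9_* function.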
+RTCD_EXTERN void (*vp9_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); + +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse); +unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse); +RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vp9_rtcd(void); @@ -884,587 +901,408 @@ static void setup_rtcd_internal(void) (void)flags; - vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3; - - vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3; - - vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3; - - vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3; - - - - vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; - - vp9_v_predictor_4x4 = vp9_v_predictor_4x4_c; - if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse; - - vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_c; - if (flags & HAS_SSE) vp9_tm_predictor_4x4 = 
vp9_tm_predictor_4x4_sse; - - vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_c; - if (flags & HAS_SSE) vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_sse; - - - - - vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3; - - vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3; - - vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3; - - vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; - - - - vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; - - vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; - if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; - - vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_c; - if (flags & HAS_SSE2) vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_sse2; - - vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_c; - if (flags & HAS_SSE) vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_sse; - - - - - vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; - - vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; - - vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; - - vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; - - - - vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; - - vp9_v_predictor_16x16 = vp9_v_predictor_16x16_c; - if (flags & HAS_SSE2) vp9_v_predictor_16x16 = vp9_v_predictor_16x16_sse2; - - vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_c; - if (flags & HAS_SSE2) vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_sse2; - - vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_c; - if (flags & HAS_SSE2) vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_sse2; - - - - - vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; - - vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; - - vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; - - vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; - - - - - vp9_v_predictor_32x32 = vp9_v_predictor_32x32_c; - if (flags & HAS_SSE2) vp9_v_predictor_32x32 = vp9_v_predictor_32x32_sse2; - - - vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_c; - if (flags & HAS_SSE2) vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_sse2; - - - - - vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2; - - vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c; - if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2; - - vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c; - if (flags & HAS_MMX) vp9_loop_filter_vertical_edge 
= vp9_loop_filter_vertical_edge_mmx; - - vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2; - - vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c; - if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2; - - vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c; - if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx; - - - - - vp9_convolve_copy = vp9_convolve_copy_c; - if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2; - - vp9_convolve_avg = vp9_convolve_avg_c; - if (flags & HAS_SSE2) vp9_convolve_avg = vp9_convolve_avg_sse2; - + vp9_block_error = vp9_block_error_c; + if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_convolve8 = vp9_convolve8_c; if (flags & HAS_SSE2) vp9_convolve8 = vp9_convolve8_sse2; if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3; - - vp9_convolve8_horiz = vp9_convolve8_horiz_c; - if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; - - vp9_convolve8_vert = vp9_convolve8_vert_c; - if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; - + if (flags & HAS_AVX2) vp9_convolve8 = vp9_convolve8_avx2; vp9_convolve8_avg = vp9_convolve8_avg_c; if (flags & HAS_SSE2) vp9_convolve8_avg = vp9_convolve8_avg_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3; - vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_c; if (flags & HAS_SSE2) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3; - vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_c; if (flags & HAS_SSE2) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3; - - vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; - if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2; - - vp9_idct4x4_16_add = vp9_idct4x4_16_add_c; - if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2; - - vp9_idct8x8_1_add = vp9_idct8x8_1_add_c; - if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2; - - vp9_idct8x8_64_add = vp9_idct8x8_64_add_c; - if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2; - - vp9_idct8x8_10_add = vp9_idct8x8_10_add_c; - if (flags & HAS_SSE2) vp9_idct8x8_10_add = vp9_idct8x8_10_add_sse2; - - vp9_idct16x16_1_add = vp9_idct16x16_1_add_c; - if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2; - - vp9_idct16x16_256_add = vp9_idct16x16_256_add_c; - if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2; - - vp9_idct16x16_10_add = vp9_idct16x16_10_add_c; - if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; - - vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c; - if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2; - - vp9_idct32x32_34_add = vp9_idct32x32_34_add_c; - if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2; - - vp9_idct32x32_1_add = vp9_idct32x32_1_add_c; - if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2; - - vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; - if (flags & HAS_SSE2) vp9_iht4x4_16_add = 
vp9_iht4x4_16_add_sse2; - - vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; - if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; - - vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; - if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; - - - - vp9_variance32x16 = vp9_variance32x16_c; - if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; - - vp9_variance16x32 = vp9_variance16x32_c; - if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; - - vp9_variance64x32 = vp9_variance64x32_c; - if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2; - - vp9_variance32x64 = vp9_variance32x64_c; - if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; - - vp9_variance32x32 = vp9_variance32x32_c; - if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2; - - vp9_variance64x64 = vp9_variance64x64_c; - if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; - - vp9_variance16x16 = vp9_variance16x16_c; - if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx; - if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; - - vp9_variance16x8 = vp9_variance16x8_c; - if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx; - if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; - - vp9_variance8x16 = vp9_variance8x16_c; - if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx; - if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2; - - vp9_variance8x8 = vp9_variance8x8_c; - if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx; - if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; - - vp9_get_sse_sum_8x8 = vp9_get_sse_sum_8x8_c; - if (flags & HAS_SSE2) vp9_get_sse_sum_8x8 = vp9_get8x8var_sse2; - - vp9_variance8x4 = vp9_variance8x4_c; - if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2; - - vp9_variance4x8 = vp9_variance4x8_c; - if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; - - vp9_variance4x4 = vp9_variance4x4_c; - if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx; - if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; - - vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3; - - vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3; - - vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3; - - vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3; - - vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3; - - vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x32 = 
vp9_sub_pixel_avg_variance64x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3; - - vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3; - - vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3; - - vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3; - - vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3; - - vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3; - - vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3; - - vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3; - - vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3; - - vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3; - - vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3; - - vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3; - - vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3; - - vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; - - vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x8 = 
vp9_sub_pixel_avg_variance8x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3; - - vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_c; - if (flags & HAS_SSE2) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; - - vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_c; - if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3; - - vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_c; - if (flags & HAS_SSE) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3; - - vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_c; - if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3; - - vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_c; - if (flags & HAS_SSE) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3; - - vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_c; - if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3; - - vp9_sad64x64 = vp9_sad64x64_c; - if (flags & HAS_SSE2) vp9_sad64x64 = vp9_sad64x64_sse2; - - vp9_sad32x64 = vp9_sad32x64_c; - if (flags & HAS_SSE2) vp9_sad32x64 = vp9_sad32x64_sse2; - - vp9_sad64x32 = vp9_sad64x32_c; - if (flags & HAS_SSE2) vp9_sad64x32 = vp9_sad64x32_sse2; - - vp9_sad32x16 = vp9_sad32x16_c; - if (flags & HAS_SSE2) vp9_sad32x16 = vp9_sad32x16_sse2; - - vp9_sad16x32 = vp9_sad16x32_c; - if (flags & HAS_SSE2) vp9_sad16x32 = vp9_sad16x32_sse2; - - vp9_sad32x32 = vp9_sad32x32_c; - if (flags & HAS_SSE2) vp9_sad32x32 = vp9_sad32x32_sse2; - - vp9_sad16x16 = vp9_sad16x16_c; - if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx; - if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2; - - vp9_sad16x8 = vp9_sad16x8_c; - if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx; - if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2; - - vp9_sad8x16 = vp9_sad8x16_c; - if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx; - if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2; - - vp9_sad8x8 = vp9_sad8x8_c; - if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx; - if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2; - - vp9_sad8x4 = vp9_sad8x4_c; - if (flags & HAS_SSE2) vp9_sad8x4 = vp9_sad8x4_sse2; - - vp9_sad4x8 = vp9_sad4x8_c; - if (flags & HAS_SSE) vp9_sad4x8 = vp9_sad4x8_sse; - - vp9_sad4x4 = vp9_sad4x4_c; - if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx; - if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse; - - vp9_sad64x64_avg = vp9_sad64x64_avg_c; - if (flags & HAS_SSE2) vp9_sad64x64_avg = vp9_sad64x64_avg_sse2; - - vp9_sad32x64_avg = vp9_sad32x64_avg_c; - if (flags & HAS_SSE2) vp9_sad32x64_avg = vp9_sad32x64_avg_sse2; - - vp9_sad64x32_avg = vp9_sad64x32_avg_c; - if (flags & HAS_SSE2) vp9_sad64x32_avg = vp9_sad64x32_avg_sse2; - - vp9_sad32x16_avg = vp9_sad32x16_avg_c; - if (flags & HAS_SSE2) vp9_sad32x16_avg = vp9_sad32x16_avg_sse2; - - vp9_sad16x32_avg = vp9_sad16x32_avg_c; - if (flags & HAS_SSE2) vp9_sad16x32_avg = 
vp9_sad16x32_avg_sse2; - - vp9_sad32x32_avg = vp9_sad32x32_avg_c; - if (flags & HAS_SSE2) vp9_sad32x32_avg = vp9_sad32x32_avg_sse2; - - vp9_sad16x16_avg = vp9_sad16x16_avg_c; - if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2; - - vp9_sad16x8_avg = vp9_sad16x8_avg_c; - if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2; - - vp9_sad8x16_avg = vp9_sad8x16_avg_c; - if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2; - - vp9_sad8x8_avg = vp9_sad8x8_avg_c; - if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2; - - vp9_sad8x4_avg = vp9_sad8x4_avg_c; - if (flags & HAS_SSE2) vp9_sad8x4_avg = vp9_sad8x4_avg_sse2; - - vp9_sad4x8_avg = vp9_sad4x8_avg_c; - if (flags & HAS_SSE) vp9_sad4x8_avg = vp9_sad4x8_avg_sse; - - vp9_sad4x4_avg = vp9_sad4x4_avg_c; - if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse; - - vp9_variance_halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_c; - if (flags & HAS_SSE2) vp9_variance_halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_sse2; - - vp9_variance_halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_c; - if (flags & HAS_SSE2) vp9_variance_halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_sse2; - - vp9_variance_halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_c; - if (flags & HAS_SSE2) vp9_variance_halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_sse2; - - - - - - - - - - vp9_sad16x16x3 = vp9_sad16x16x3_c; - if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; - - vp9_sad16x8x3 = vp9_sad16x8x3_c; - if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; - - vp9_sad8x16x3 = vp9_sad8x16x3_c; - if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; - - vp9_sad8x8x3 = vp9_sad8x8x3_c; - if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; - - vp9_sad4x4x3 = vp9_sad4x4x3_c; - if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; - - - - - - - - - - - vp9_sad64x64x4d = vp9_sad64x64x4d_c; - if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; - - vp9_sad32x64x4d = vp9_sad32x64x4d_c; - if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; - - vp9_sad64x32x4d = vp9_sad64x32x4d_c; - if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2; - - vp9_sad32x16x4d = vp9_sad32x16x4d_c; - if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2; - - vp9_sad16x32x4d = vp9_sad16x32x4d_c; - if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; - - vp9_sad32x32x4d = vp9_sad32x32x4d_c; - if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2; - - vp9_sad16x16x4d = vp9_sad16x16x4d_c; - if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2; - - vp9_sad16x8x4d = vp9_sad16x8x4d_c; - if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2; - - vp9_sad8x16x4d = vp9_sad8x16x4d_c; - if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2; - - vp9_sad8x8x4d = vp9_sad8x8x4d_c; - if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2; - - vp9_sad8x4x4d = vp9_sad8x4x4d_c; - if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; - - vp9_sad4x8x4d = vp9_sad4x8x4d_c; - if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse; - - vp9_sad4x4x4d = vp9_sad4x4x4d_c; - if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse; - - vp9_mse16x16 = vp9_mse16x16_c; - if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx; - if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; - - - - - - - vp9_get_mb_ss = vp9_get_mb_ss_c; - if (flags & HAS_MMX) vp9_get_mb_ss = 
vp9_get_mb_ss_mmx; - if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; - - vp9_block_error = vp9_block_error_c; - if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; - - vp9_subtract_block = vp9_subtract_block_c; - if (flags & HAS_SSE2) vp9_subtract_block = vp9_subtract_block_sse2; - - - - vp9_short_fht4x4 = vp9_short_fht4x4_c; - if (flags & HAS_SSE2) vp9_short_fht4x4 = vp9_short_fht4x4_sse2; - - vp9_short_fht8x8 = vp9_short_fht8x8_c; - if (flags & HAS_SSE2) vp9_short_fht8x8 = vp9_short_fht8x8_sse2; - - vp9_short_fht16x16 = vp9_short_fht16x16_c; - if (flags & HAS_SSE2) vp9_short_fht16x16 = vp9_short_fht16x16_sse2; - - - vp9_fdct4x4 = vp9_fdct4x4_c; - if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2; - - vp9_fdct8x8 = vp9_fdct8x8_c; - if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2; - + vp9_convolve8_horiz = vp9_convolve8_horiz_c; + if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vp9_convolve8_horiz = vp9_convolve8_horiz_avx2; + vp9_convolve8_vert = vp9_convolve8_vert_c; + if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2; + vp9_convolve_avg = vp9_convolve_avg_c; + if (flags & HAS_SSE2) vp9_convolve_avg = vp9_convolve_avg_sse2; + vp9_convolve_copy = vp9_convolve_copy_c; + if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2; + vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; + vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; + vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; + vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; + vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3; + vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3; + vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; + vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; + vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3; + vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3; + vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; + vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; + vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3; + vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = 
vp9_d63_predictor_8x8_ssse3; + vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_c; + if (flags & HAS_SSE2) vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_sse2; + vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_c; + if (flags & HAS_SSE2) vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_sse2; + vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_c; + if (flags & HAS_SSE) vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_sse; + vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_c; + if (flags & HAS_SSE) vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_sse; vp9_fdct16x16 = vp9_fdct16x16_c; if (flags & HAS_SSE2) vp9_fdct16x16 = vp9_fdct16x16_sse2; - + vp9_fdct16x16_1 = vp9_fdct16x16_1_c; + if (flags & HAS_SSE2) vp9_fdct16x16_1 = vp9_fdct16x16_1_sse2; vp9_fdct32x32 = vp9_fdct32x32_c; if (flags & HAS_SSE2) vp9_fdct32x32 = vp9_fdct32x32_sse2; - + if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2; + vp9_fdct32x32_1 = vp9_fdct32x32_1_c; + if (flags & HAS_SSE2) vp9_fdct32x32_1 = vp9_fdct32x32_1_sse2; vp9_fdct32x32_rd = vp9_fdct32x32_rd_c; if (flags & HAS_SSE2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2; - + if (flags & HAS_AVX2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_avx2; + vp9_fdct4x4 = vp9_fdct4x4_c; + if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2; + vp9_fdct4x4_1 = vp9_fdct4x4_1_c; + if (flags & HAS_SSE2) vp9_fdct4x4_1 = vp9_fdct4x4_1_sse2; + vp9_fdct8x8 = vp9_fdct8x8_c; + if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2; + vp9_fdct8x8_1 = vp9_fdct8x8_1_c; + if (flags & HAS_SSE2) vp9_fdct8x8_1 = vp9_fdct8x8_1_sse2; + vp9_fht16x16 = vp9_fht16x16_c; + if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2; + vp9_fht4x4 = vp9_fht4x4_c; + if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2; + vp9_fht8x8 = vp9_fht8x8_c; + if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2; vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - - vp9_refining_search_sad = vp9_refining_search_sad_c; - if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4; - - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4; - + vp9_fwht4x4 = vp9_fwht4x4_c; + if (flags & HAS_MMX) vp9_fwht4x4 = vp9_fwht4x4_mmx; + vp9_get16x16var = vp9_get16x16var_c; + if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2; + if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; + vp9_get8x8var = vp9_get8x8var_c; + if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2; + vp9_get_mb_ss = vp9_get_mb_ss_c; + if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2; + vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; + vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; + vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3; + vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; + vp9_idct16x16_10_add = vp9_idct16x16_10_add_c; + if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3; + vp9_idct16x16_1_add = vp9_idct16x16_1_add_c; + if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2; + vp9_idct16x16_256_add = vp9_idct16x16_256_add_c; + if (flags & 
HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3; + vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c; + if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2; + vp9_idct32x32_1_add = vp9_idct32x32_1_add_c; + if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2; + vp9_idct32x32_34_add = vp9_idct32x32_34_add_c; + if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2; + vp9_idct4x4_16_add = vp9_idct4x4_16_add_c; + if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2; + vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; + if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2; + vp9_idct8x8_12_add = vp9_idct8x8_12_add_c; + if (flags & HAS_SSE2) vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2; + vp9_idct8x8_1_add = vp9_idct8x8_1_add_c; + if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2; + vp9_idct8x8_64_add = vp9_idct8x8_64_add_c; + if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2; + vp9_iht16x16_256_add = vp9_iht16x16_256_add_c; + if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2; + vp9_iht4x4_16_add = vp9_iht4x4_16_add_c; + if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2; + vp9_iht8x8_64_add = vp9_iht8x8_64_add_c; + if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2; + vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2; + vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_c; + if (flags & HAS_MMX) vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_mmx; + vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_sse2; + vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_sse2; + vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_c; + if (flags & HAS_SSE2) vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_sse2; + vp9_lpf_vertical_16 = vp9_lpf_vertical_16_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_16 = vp9_lpf_vertical_16_sse2; + vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_sse2; + vp9_lpf_vertical_4 = vp9_lpf_vertical_4_c; + if (flags & HAS_MMX) vp9_lpf_vertical_4 = vp9_lpf_vertical_4_mmx; + vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_sse2; + vp9_lpf_vertical_8 = vp9_lpf_vertical_8_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2; + vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c; + if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2; + vp9_mse16x16 = vp9_mse16x16_c; + if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2; + if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; + vp9_mse16x8 = vp9_mse16x8_c; + if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2; + vp9_mse8x16 = vp9_mse8x16_c; + if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2; + vp9_mse8x8 = vp9_mse8x8_c; + if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2; + vp9_sad16x16 = vp9_sad16x16_c; + if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2; + vp9_sad16x16_avg = vp9_sad16x16_avg_c; + if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2; + vp9_sad16x16x3 = vp9_sad16x16x3_c; 
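/* Dispatch idiom, repeated for every entry in setup_rtcd_internal() and
 * visible here at vp9_sad16x16x3: the pointer starts at the portable _c
 * fallback and is conditionally re-pointed as each capability bit is
 * tested. The tests run from older to newer extensions (SSE, SSE2, SSE3,
 * SSSE3, AVX2), so the last match wins and the most capable routine the
 * CPU reports is the one installed. With explicit precedence this is
 * equivalent to (illustrative rewrite, not generated output):
 *
 *   if (flags & HAS_SSSE3)      vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
 *   else if (flags & HAS_SSE3)  vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
 *   else                        vp9_sad16x16x3 = vp9_sad16x16x3_c;
 */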
+ if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; + vp9_sad16x16x4d = vp9_sad16x16x4d_c; + if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2; + vp9_sad16x32 = vp9_sad16x32_c; + if (flags & HAS_SSE2) vp9_sad16x32 = vp9_sad16x32_sse2; + vp9_sad16x32_avg = vp9_sad16x32_avg_c; + if (flags & HAS_SSE2) vp9_sad16x32_avg = vp9_sad16x32_avg_sse2; + vp9_sad16x32x4d = vp9_sad16x32x4d_c; + if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2; + vp9_sad16x8 = vp9_sad16x8_c; + if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2; + vp9_sad16x8_avg = vp9_sad16x8_avg_c; + if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2; + vp9_sad16x8x3 = vp9_sad16x8x3_c; + if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; + vp9_sad16x8x4d = vp9_sad16x8x4d_c; + if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2; + vp9_sad32x16 = vp9_sad32x16_c; + if (flags & HAS_SSE2) vp9_sad32x16 = vp9_sad32x16_sse2; + vp9_sad32x16_avg = vp9_sad32x16_avg_c; + if (flags & HAS_SSE2) vp9_sad32x16_avg = vp9_sad32x16_avg_sse2; + vp9_sad32x16x4d = vp9_sad32x16x4d_c; + if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2; + vp9_sad32x32 = vp9_sad32x32_c; + if (flags & HAS_SSE2) vp9_sad32x32 = vp9_sad32x32_sse2; + vp9_sad32x32_avg = vp9_sad32x32_avg_c; + if (flags & HAS_SSE2) vp9_sad32x32_avg = vp9_sad32x32_avg_sse2; + vp9_sad32x32x4d = vp9_sad32x32x4d_c; + if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vp9_sad32x32x4d = vp9_sad32x32x4d_avx2; + vp9_sad32x64 = vp9_sad32x64_c; + if (flags & HAS_SSE2) vp9_sad32x64 = vp9_sad32x64_sse2; + vp9_sad32x64_avg = vp9_sad32x64_avg_c; + if (flags & HAS_SSE2) vp9_sad32x64_avg = vp9_sad32x64_avg_sse2; + vp9_sad32x64x4d = vp9_sad32x64x4d_c; + if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2; + vp9_sad4x4 = vp9_sad4x4_c; + if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse; + vp9_sad4x4_avg = vp9_sad4x4_avg_c; + if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse; + vp9_sad4x4x3 = vp9_sad4x4x3_c; + if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; + vp9_sad4x4x4d = vp9_sad4x4x4d_c; + if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse; + vp9_sad4x8 = vp9_sad4x8_c; + if (flags & HAS_SSE) vp9_sad4x8 = vp9_sad4x8_sse; + vp9_sad4x8_avg = vp9_sad4x8_avg_c; + if (flags & HAS_SSE) vp9_sad4x8_avg = vp9_sad4x8_avg_sse; + vp9_sad4x8x4d = vp9_sad4x8x4d_c; + if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse; + vp9_sad64x32 = vp9_sad64x32_c; + if (flags & HAS_SSE2) vp9_sad64x32 = vp9_sad64x32_sse2; + vp9_sad64x32_avg = vp9_sad64x32_avg_c; + if (flags & HAS_SSE2) vp9_sad64x32_avg = vp9_sad64x32_avg_sse2; + vp9_sad64x32x4d = vp9_sad64x32x4d_c; + if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2; + vp9_sad64x64 = vp9_sad64x64_c; + if (flags & HAS_SSE2) vp9_sad64x64 = vp9_sad64x64_sse2; + vp9_sad64x64_avg = vp9_sad64x64_avg_c; + if (flags & HAS_SSE2) vp9_sad64x64_avg = vp9_sad64x64_avg_sse2; + vp9_sad64x64x4d = vp9_sad64x64x4d_c; + if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vp9_sad64x64x4d = vp9_sad64x64x4d_avx2; + vp9_sad8x16 = vp9_sad8x16_c; + if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2; + vp9_sad8x16_avg = vp9_sad8x16_avg_c; + if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2; + vp9_sad8x16x3 = vp9_sad8x16x3_c; + if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; + vp9_sad8x16x4d = 
vp9_sad8x16x4d_c; + if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2; + vp9_sad8x4 = vp9_sad8x4_c; + if (flags & HAS_SSE2) vp9_sad8x4 = vp9_sad8x4_sse2; + vp9_sad8x4_avg = vp9_sad8x4_avg_c; + if (flags & HAS_SSE2) vp9_sad8x4_avg = vp9_sad8x4_avg_sse2; + vp9_sad8x4x4d = vp9_sad8x4x4d_c; + if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2; + vp9_sad8x8 = vp9_sad8x8_c; + if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2; + vp9_sad8x8_avg = vp9_sad8x8_avg_c; + if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2; + vp9_sad8x8x3 = vp9_sad8x8x3_c; + if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; + vp9_sad8x8x4d = vp9_sad8x8x4d_c; + if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2; + vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3; + vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3; + vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3; + vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3; + vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_avx2; + vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3; + vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_c; + if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3; + vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_c; + if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3; + vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3; + vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_avx2; + 
vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3; + vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3; + vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_c; + if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3; + vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3; + vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3; + vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3; + vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3; + vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_avx2; + vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3; + vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_c; + if (flags & HAS_SSE) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3; + vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_c; + if (flags & HAS_SSE) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3; + vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3; + vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_avx2; + vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3; + 
vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; + vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_c; + if (flags & HAS_SSE2) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; + vp9_subtract_block = vp9_subtract_block_c; + if (flags & HAS_SSE2) vp9_subtract_block = vp9_subtract_block_sse2; vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2; + vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_c; + if (flags & HAS_SSE2) vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_sse2; + vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_c; + if (flags & HAS_SSE) vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_sse; + vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_c; + if (flags & HAS_SSE2) vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_sse2; + vp9_v_predictor_16x16 = vp9_v_predictor_16x16_c; + if (flags & HAS_SSE2) vp9_v_predictor_16x16 = vp9_v_predictor_16x16_sse2; + vp9_v_predictor_32x32 = vp9_v_predictor_32x32_c; + if (flags & HAS_SSE2) vp9_v_predictor_32x32 = vp9_v_predictor_32x32_sse2; + vp9_v_predictor_4x4 = vp9_v_predictor_4x4_c; + if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse; + vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; + if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; + vp9_variance16x16 = vp9_variance16x16_c; + if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2; + if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; + vp9_variance16x32 = vp9_variance16x32_c; + if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2; + vp9_variance16x8 = vp9_variance16x8_c; + if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2; + vp9_variance32x16 = vp9_variance32x16_c; + if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2; + if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; + vp9_variance32x32 = vp9_variance32x32_c; + if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2; + if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; + vp9_variance32x64 = vp9_variance32x64_c; + if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2; + vp9_variance4x4 = vp9_variance4x4_c; + if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2; + vp9_variance4x8 = vp9_variance4x8_c; + if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2; + vp9_variance64x32 = vp9_variance64x32_c; + if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2; + if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; + vp9_variance64x64 = vp9_variance64x64_c; + if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2; + if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; + vp9_variance8x16 = vp9_variance8x16_c; + if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2; + vp9_variance8x4 = vp9_variance8x4_c; + if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2; + vp9_variance8x8 = vp9_variance8x8_c; + if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp9_rtcd_x86-win32-vs8.h b/media/libvpx/vp9_rtcd_x86-win32-vs8.h index ba86228db6a2..aa34a254b4fa 100644 --- 
diff --git a/media/libvpx/vp9_rtcd_x86-win32-vs8.h b/media/libvpx/vp9_rtcd_x86-win32-vs8.h
index ba86228db6a2..aa34a254b4fa 100644
--- a/media/libvpx/vp9_rtcd_x86-win32-vs8.h
+++ b/media/libvpx/vp9_rtcd_x86-win32-vs8.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP9
  */
@@ -19,253 +23,20 @@ struct macroblockd;
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct search_site_config;
+struct mv;
 union int_mv;
 struct yv12_buffer_config;
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
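The hunks in this header mix two binding styles, which the alphabetically re-sorted listing below makes easy to see: a symbol with a single implementation is aliased to it with a compile-time #define, while a symbol with several implementations is declared as an RTCD_EXTERN function pointer that setup_rtcd_internal() fills in. The following sketch shows the one-definition trick behind RTCD_EXTERN, assuming the RTCD_C convention these generated headers appear to use; it is illustrative, not part of the patch.

    /* rtcd.h -- included by every consumer */
    #ifdef RTCD_C
    #define RTCD_EXTERN            /* the single .c file that defines the pointers */
    #else
    #define RTCD_EXTERN extern     /* every other file only sees declarations */
    #endif

    /* One implementation: bound at compile time, no indirection. */
    void some_fn_c(void);
    #define some_fn some_fn_c

    /* Several implementations: routed through a pointer set at runtime. */
    RTCD_EXTERN void (*other_fn)(void);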
@@ -281,14 +52,292 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8
 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve_avg)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct16x16_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct4x4_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8_1)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *);
+RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct4x4_16_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct4x4_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vp9_idct8x8_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
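The inverse-transform names encode how many coefficients a variant assumes: the _1 versions handle DC-only blocks, _10/_12/_34 handle sparse blocks, and _64/_256/_1024 are the full transforms (the next hunk also tracks upstream's rename of the sparse 8x8 case from vp9_idct8x8_10_add to vp9_idct8x8_12_add). Roughly how a decoder would pick a 16x16 variant from the end-of-block count once vp9_rtcd() has filled in the pointers, as an illustrative sketch rather than libvpx source:

    /* Illustrative only: dispatch on eob, the index of the last nonzero
     * coefficient, using the pointers declared in this header. */
    static void idct16x16_add(const int16_t *input, uint8_t *dest,
                              int stride, int eob) {
      if (eob == 1)
        vp9_idct16x16_1_add(input, dest, stride);    /* DC-only block */
      else if (eob <= 10)
        vp9_idct16x16_10_add(input, dest, stride);   /* few low-frequency coeffs */
      else
        vp9_idct16x16_256_add(input, dest, stride);  /* general case, all 256 */
    }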
vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct8x8_10_add)(const int16_t *input, uint8_t *dest, int dest_stride); - -void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct16x16_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); - -void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride); - -void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride); - -void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride); - -void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride); - -void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type); +RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type); void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); @@ -333,485 +358,71 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); RTCD_EXTERN void (*vp9_iht8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type); -RTCD_EXTERN void (*vp9_iht16x16_256_add)(const int16_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c -void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int 
dest_stride); -#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c +void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vp9_lpf_vertical_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +RTCD_EXTERN void (*vp9_lpf_vertical_16_dual)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +RTCD_EXTERN void (*vp9_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vp9_get_sse_sum_8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned 
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
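For reference while reading the removed variance kernels: each one reports the raw sum of squared differences through *sse and returns the true variance, i.e. the SSE with the squared mean subtracted. A sketch of the generic W x H computation, not the tuned libvpx kernel:

    #include <stdint.h>

    /* Reference variance over a w x h block. *sse gets the plain sum of
     * squared differences; the return value subtracts sum^2 / (w*h). */
    static unsigned int variance_wxh(const uint8_t *src_ptr, int source_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     int w, int h, unsigned int *sse) {
      int sum = 0;
      unsigned int sq = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int diff = src_ptr[j] - ref_ptr[j];
          sum += diff;
          sq += (unsigned int)(diff * diff);
        }
        src_ptr += source_stride;
        ref_ptr += ref_stride;
      }
      *sse = sq;
      return sq - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }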
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
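The xoffset/yoffset arguments in these sub_pixel kernels select a sub-pel position: the source block is bilinearly interpolated first, then the full-pel variance runs against the reference. A sketch of the two-pass interpolation, assuming eighth-pel offsets and an 8-entry filter table like the one in vp9's variance code (the table values are the standard 128-sum bilinear pairs):

    #include <stdint.h>

    /* Each filter pair sums to 128, so >> 7 renormalizes after filtering. */
    static const int bilinear[8][2] = {
      { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
      { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
    };

    /* Horizontal pass into hbuf (one extra row), then vertical pass into
     * temp; a full-pel kernel such as variance_wxh above is then run on
     * temp.  Assumes w+1 columns and h+1 rows are readable from src. */
    static void bilinear_2pass(const uint8_t *src, int src_stride,
                               uint8_t *temp, int w, int h,
                               int xoffset, int yoffset) {
      uint8_t hbuf[(64 + 1) * 64];  /* large enough for a 64x64 block */
      for (int i = 0; i < h + 1; ++i)
        for (int j = 0; j < w; ++j)
          hbuf[i * w + j] = (uint8_t)((src[i * src_stride + j] * bilinear[xoffset][0] +
                                       src[i * src_stride + j + 1] * bilinear[xoffset][1] +
                                       64) >> 7);
      for (int i = 0; i < h; ++i)
        for (int j = 0; j < w; ++j)
          temp[i * w + j] = (uint8_t)((hbuf[i * w + j] * bilinear[yoffset][0] +
                                       hbuf[(i + 1) * w + j] * bilinear[yoffset][1] +
                                       64) >> 7);
    }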
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
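The removed SAD prototypes that follow all share one shape: sum of absolute differences over a W x H block, with a max_sad argument that a caller can supply as an early-out threshold during motion search (note that the replacement prototypes later in this patch drop that argument). A reference sketch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Reference SAD for a w x h block, matching the removed
     * vp9_sad##WxH##_c signature. */
    static unsigned int sad_wxh(const uint8_t *src_ptr, int source_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                int w, int h, unsigned int max_sad) {
      unsigned int sad = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j)
          sad += (unsigned int)abs(src_ptr[j] - ref_ptr[j]);
        if (sad > max_sad)
          return sad;  /* already worse than the best candidate: bail out */
        src_ptr += source_stride;
        ref_ptr += ref_stride;
      }
      return sad;
    }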
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad64x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad32x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad16x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad8x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-RTCD_EXTERN unsigned int (*vp9_sad4x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance_halfpixvar16x16_h)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance_halfpixvar16x16_v)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_variance_halfpixvar16x16_hv)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
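The _avg variants above score a compound prediction: the reference block is first averaged, with rounding, against a packed second predictor, and the SAD is taken against that average. A sketch reusing the sad-style loop, with second_pred assumed to be a contiguous w x h buffer as in the prototypes:

    #include <stdint.h>
    #include <stdlib.h>

    static unsigned int sad_avg_wxh(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    int w, int h,
                                    const uint8_t *second_pred,
                                    unsigned int max_sad) {
      unsigned int sad = 0;
      for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
          const int avg = (ref_ptr[j] + second_pred[j] + 1) >> 1; /* rounded */
          sad += (unsigned int)abs(src_ptr[j] - avg);
        }
        if (sad > max_sad) return sad;
        src_ptr += source_stride;
        ref_ptr += ref_stride;
        second_pred += w;  /* second predictor is packed, stride == w */
      }
      return sad;
    }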
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
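The x3, x8 and x4d suffixes denote batched SAD kernels: x3 and x8 slide the reference pointer across 3 or 8 adjacent positions, while x4d scores 4 independent reference candidates in one call, which is what makes the SSE2 versions profitable. A sketch of the x4d shape in terms of the single-block sad_wxh above (UINT_MAX disables the early exit, since all four results are needed):

    #include <limits.h>
    #include <stdint.h>

    static void sad_wxh_x4d(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *const ref_ptr[], int ref_stride,
                            int w, int h, unsigned int *sad_array) {
      for (int k = 0; k < 4; ++k)  /* one SAD per candidate reference */
        sad_array[k] = sad_wxh(src_ptr, src_stride, ref_ptr[k], ref_stride,
                               w, h, UINT_MAX);
    }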
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-RTCD_EXTERN unsigned int (*vp9_get_mb_ss)(const int16_t *);
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-RTCD_EXTERN int64_t (*vp9_block_error)(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-
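For orientation, vp9_block_error (removed here, it moves elsewhere in the generated header) measures the coding distortion between original and dequantized transform coefficients and also reports the source-coefficient energy through *ssz. A reference sketch consistent with the prototype above; hedged, as the exact removed body is not shown in this hunk:

    #include <stdint.h>

    static int64_t block_error(const int16_t *coeff, const int16_t *dqcoeff,
                               intptr_t block_size, int64_t *ssz) {
      int64_t error = 0, sqcoeff = 0;
      for (intptr_t i = 0; i < block_size; ++i) {
        const int diff = coeff[i] - dqcoeff[i];
        error += (int64_t)diff * diff;            /* distortion after quantization */
        sqcoeff += (int64_t)coeff[i] * coeff[i];  /* energy of the source coeffs */
      }
      *ssz = sqcoeff;
      return error;
    }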
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-RTCD_EXTERN void (*vp9_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b vp9_quantize_b_c
@@ -819,60 +430,446 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_bloc
 void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_b_32x32 vp9_quantize_b_32x32_c
-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht4x4)(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_c
-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht8x8)(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-RTCD_EXTERN void (*vp9_short_fht16x16)(const int16_t *input, int16_t *output, int stride, int tx_type);
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_refining_search_sad vp9_refining_search_sad_c
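The newly declared vp9_quantize_fp entry points suggest a "fast path" quantizer: round, scale by the quantizer, restore the sign, and track the last nonzero coefficient. A deliberately simplified scalar sketch of that shape; real vp9 code also handles skip_block, walks the scan[] order, and splits DC/AC table entries, so treat this as illustrative only:

    #include <stdint.h>
    #include <stdlib.h>

    static void quantize_fp_sketch(const int16_t *coeff_ptr, intptr_t n_coeffs,
                                   const int16_t *round_ptr, const int16_t *quant_ptr,
                                   int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                                   const int16_t *dequant_ptr, uint16_t *eob_ptr) {
      intptr_t eob = -1;
      for (intptr_t i = 0; i < n_coeffs; ++i) {
        const int rc = (int)i;            /* real code indexes via scan[i] */
        const int k = (rc == 0) ? 0 : 1;  /* DC uses entry 0, AC entry 1 */
        const int coeff = coeff_ptr[rc];
        const int sign = coeff < 0 ? -1 : 1;
        const int tmp = ((abs(coeff) + round_ptr[k]) * quant_ptr[k]) >> 16;
        qcoeff_ptr[rc] = (int16_t)(tmp * sign);
        dqcoeff_ptr[rc] = (int16_t)(qcoeff_ptr[rc] * dequant_ptr[k]);
        if (tmp) eob = i;                 /* remember last nonzero position */
      }
      *eob_ptr = (uint16_t)(eob + 1);
    }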
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct4x4)(const int16_t *input, int16_t *output, int stride);
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad16x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct16x16)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sad16x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+RTCD_EXTERN unsigned int (*vp9_sad16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
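The removed full/refining/diamond search prototypes above (and the slimmer refining-search prototype added earlier in this hunk) all follow the same pattern: a SAD kernel from the vp9_variance_vtable is probed at candidate motion vectors and the best one wins. A heavily simplified sketch of the refining step; the struct macroblock plumbing is elided, and src/ref/strides here are illustrative stand-ins, not the real interface:

    #include <stdint.h>

    struct mv { int16_t row, col; };  /* mirrors vp9's two-field mv */

    static void refining_search_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       struct mv *best_mv, int distance,
                                       unsigned int (*sdf)(const uint8_t *, int,
                                                           const uint8_t *, int)) {
      static const struct mv nb[4] = { {-1, 0}, {0, -1}, {0, 1}, {1, 0} };
      unsigned int best = sdf(src, src_stride,
                              ref + best_mv->row * ref_stride + best_mv->col,
                              ref_stride);
      for (int i = 0; i < distance; ++i) {
        int moved = 0;
        for (int n = 0; n < 4; ++n) {  /* probe the four one-pel neighbours */
          const struct mv cand = { (int16_t)(best_mv->row + nb[n].row),
                                   (int16_t)(best_mv->col + nb[n].col) };
          const unsigned int s = sdf(src, src_stride,
                                     ref + cand.row * ref_stride + cand.col,
                                     ref_stride);
          if (s < best) { best = s; *best_mv = cand; moved = 1; }
        }
        if (!moved) break;  /* settled in a local minimum */
      }
    }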
filter_weight, unsigned int *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad16x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction); -#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c +void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x8x8 vp9_sad16x8x8_c + +unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad32x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int 
(*vp9_sad32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad32x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x3 vp9_sad32x32x3_c + +void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad32x32x8 vp9_sad32x32x8_c + +unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad32x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad4x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x4x8 vp9_sad4x4x8_c + +unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad4x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x8x8 vp9_sad4x8x8_c + +unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad64x32_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); 
+unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad64x64_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x3 vp9_sad64x64x3_c + +void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad64x64x8 vp9_sad64x64x8_c + +unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad8x16_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x16x8 vp9_sad8x16x8_c + +unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad8x4)(const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad8x4_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x4x8 vp9_sad8x4x8_c + +unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +RTCD_EXTERN unsigned int (*vp9_sad8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); + +unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sad8x8_avg)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); + +void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x8x8 vp9_sad8x8x8_c + +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned 
int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t 
*second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const 
uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int 
vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); 
+unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); 
+unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +RTCD_EXTERN void (*vp9_subtract_block)(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +RTCD_EXTERN void (*vp9_temporal_filter_apply)(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); + +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_c + +void 
vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vp9_rtcd(void); @@ -884,587 +881,388 @@ static void setup_rtcd_internal(void) (void)flags; - vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3; - - vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3; - - vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3; - - vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3; - - - - vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; - - vp9_v_predictor_4x4 = vp9_v_predictor_4x4_c; - if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse; - - vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_c; - if (flags & HAS_SSE) vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_sse; - - vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_c; - if (flags & HAS_SSE) vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_sse; - - - - - vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3; - - vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3; - - vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3; - - vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; - - - - vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; - - vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c; - if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse; - - vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_c; - if (flags & HAS_SSE2) vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_sse2; - - vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_c; - if (flags & HAS_SSE) vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_sse; - - - - - vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; - - vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; - - vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; - - vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; - - - - vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; - - vp9_v_predictor_16x16 = vp9_v_predictor_16x16_c; - if (flags & HAS_SSE2) vp9_v_predictor_16x16 = vp9_v_predictor_16x16_sse2; - - vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_c; - if (flags & HAS_SSE2) vp9_tm_predictor_16x16 
= vp9_tm_predictor_16x16_sse2; - - vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_c; - if (flags & HAS_SSE2) vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_sse2; - - - - - vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; - - vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; - - vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; - - vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; - - - - - vp9_v_predictor_32x32 = vp9_v_predictor_32x32_c; - if (flags & HAS_SSE2) vp9_v_predictor_32x32 = vp9_v_predictor_32x32_sse2; - - - vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_c; - if (flags & HAS_SSE2) vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_sse2; - - - - - vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2; - - vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c; - if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2; - - vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c; - if (flags & HAS_MMX) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx; - - vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c; - if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2; - - vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c; - if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2; - - vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c; - if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx; - - - - - vp9_convolve_copy = vp9_convolve_copy_c; - if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2; - - vp9_convolve_avg = vp9_convolve_avg_c; - if (flags & HAS_SSE2) vp9_convolve_avg = vp9_convolve_avg_sse2; - + vp9_block_error = vp9_block_error_c; + if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2; vp9_convolve8 = vp9_convolve8_c; if (flags & HAS_SSE2) vp9_convolve8 = vp9_convolve8_sse2; if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3; - - vp9_convolve8_horiz = vp9_convolve8_horiz_c; - if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; - - vp9_convolve8_vert = vp9_convolve8_vert_c; - if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; - vp9_convolve8_avg = vp9_convolve8_avg_c; if (flags & HAS_SSE2) vp9_convolve8_avg = vp9_convolve8_avg_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3; - vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_c; if (flags & HAS_SSE2) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3; - vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_c; if (flags & HAS_SSE2) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3; - - vp9_idct4x4_1_add = vp9_idct4x4_1_add_c; - if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2; 
-
-    vp9_idct4x4_16_add = vp9_idct4x4_16_add_c;
-    if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2;
-
-    vp9_idct8x8_1_add = vp9_idct8x8_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2;
-
-    vp9_idct8x8_64_add = vp9_idct8x8_64_add_c;
-    if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
-
-    vp9_idct8x8_10_add = vp9_idct8x8_10_add_c;
-    if (flags & HAS_SSE2) vp9_idct8x8_10_add = vp9_idct8x8_10_add_sse2;
-
-    vp9_idct16x16_1_add = vp9_idct16x16_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2;
-
-    vp9_idct16x16_256_add = vp9_idct16x16_256_add_c;
-    if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2;
-
-    vp9_idct16x16_10_add = vp9_idct16x16_10_add_c;
-    if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
-
-    vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c;
-    if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2;
-
-    vp9_idct32x32_34_add = vp9_idct32x32_34_add_c;
-    if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2;
-
-    vp9_idct32x32_1_add = vp9_idct32x32_1_add_c;
-    if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2;
-
-    vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
-    if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
-
-    vp9_iht8x8_64_add = vp9_iht8x8_64_add_c;
-    if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
-
-    vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
-    if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2;
-
-
-
-    vp9_variance32x16 = vp9_variance32x16_c;
-    if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2;
-
-    vp9_variance16x32 = vp9_variance16x32_c;
-    if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2;
-
-    vp9_variance64x32 = vp9_variance64x32_c;
-    if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2;
-
-    vp9_variance32x64 = vp9_variance32x64_c;
-    if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2;
-
-    vp9_variance32x32 = vp9_variance32x32_c;
-    if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2;
-
-    vp9_variance64x64 = vp9_variance64x64_c;
-    if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2;
-
-    vp9_variance16x16 = vp9_variance16x16_c;
-    if (flags & HAS_MMX) vp9_variance16x16 = vp9_variance16x16_mmx;
-    if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2;
-
-    vp9_variance16x8 = vp9_variance16x8_c;
-    if (flags & HAS_MMX) vp9_variance16x8 = vp9_variance16x8_mmx;
-    if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2;
-
-    vp9_variance8x16 = vp9_variance8x16_c;
-    if (flags & HAS_MMX) vp9_variance8x16 = vp9_variance8x16_mmx;
-    if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2;
-
-    vp9_variance8x8 = vp9_variance8x8_c;
-    if (flags & HAS_MMX) vp9_variance8x8 = vp9_variance8x8_mmx;
-    if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2;
-
-    vp9_get_sse_sum_8x8 = vp9_get_sse_sum_8x8_c;
-    if (flags & HAS_SSE2) vp9_get_sse_sum_8x8 = vp9_get8x8var_sse2;
-
-    vp9_variance8x4 = vp9_variance8x4_c;
-    if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2;
-
-    vp9_variance4x8 = vp9_variance4x8_c;
-    if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2;
-
-    vp9_variance4x4 = vp9_variance4x4_c;
-    if (flags & HAS_MMX) vp9_variance4x4 = vp9_variance4x4_mmx;
-    if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2;
-
-    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
-
-    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3;
-
-    vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3;
-
-    vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3;
-
-    vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3;
-
-    vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3;
-
-    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
-
-    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
-
-    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
-
-    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
-
-    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
-
-    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
-
-    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
-
-    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
-
-    vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3;
-
-    vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3;
-
-    vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3;
-
-    vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3;
-
-    vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3;
-
-    vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3;
-
-    vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3;
-
-    vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_c;
-    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3;
-
-    vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_c;
-    if (flags & HAS_SSE) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3;
-
-    vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_c;
-    if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3;
-
-    vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_c;
-    if (flags & HAS_SSE) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3;
-
-    vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_c;
-    if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3;
-
-    vp9_sad64x64 = vp9_sad64x64_c;
-    if (flags & HAS_SSE2) vp9_sad64x64 = vp9_sad64x64_sse2;
-
-    vp9_sad32x64 = vp9_sad32x64_c;
-    if (flags & HAS_SSE2) vp9_sad32x64 = vp9_sad32x64_sse2;
-
-    vp9_sad64x32 = vp9_sad64x32_c;
-    if (flags & HAS_SSE2) vp9_sad64x32 = vp9_sad64x32_sse2;
-
-    vp9_sad32x16 = vp9_sad32x16_c;
-    if (flags & HAS_SSE2) vp9_sad32x16 = vp9_sad32x16_sse2;
-
-    vp9_sad16x32 = vp9_sad16x32_c;
-    if (flags & HAS_SSE2) vp9_sad16x32 = vp9_sad16x32_sse2;
-
-    vp9_sad32x32 = vp9_sad32x32_c;
-    if (flags & HAS_SSE2) vp9_sad32x32 = vp9_sad32x32_sse2;
-
-    vp9_sad16x16 = vp9_sad16x16_c;
-    if (flags & HAS_MMX) vp9_sad16x16 = vp9_sad16x16_mmx;
-    if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2;
-
-    vp9_sad16x8 = vp9_sad16x8_c;
-    if (flags & HAS_MMX) vp9_sad16x8 = vp9_sad16x8_mmx;
-    if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2;
-
-    vp9_sad8x16 = vp9_sad8x16_c;
-    if (flags & HAS_MMX) vp9_sad8x16 = vp9_sad8x16_mmx;
-    if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2;
-
-    vp9_sad8x8 = vp9_sad8x8_c;
-    if (flags & HAS_MMX) vp9_sad8x8 = vp9_sad8x8_mmx;
-    if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2;
-
-    vp9_sad8x4 = vp9_sad8x4_c;
-    if (flags & HAS_SSE2) vp9_sad8x4 = vp9_sad8x4_sse2;
-
-    vp9_sad4x8 = vp9_sad4x8_c;
-    if (flags & HAS_SSE) vp9_sad4x8 = vp9_sad4x8_sse;
-
-    vp9_sad4x4 = vp9_sad4x4_c;
-    if (flags & HAS_MMX) vp9_sad4x4 = vp9_sad4x4_mmx;
-    if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse;
-
-    vp9_sad64x64_avg = vp9_sad64x64_avg_c;
-    if (flags & HAS_SSE2) vp9_sad64x64_avg = vp9_sad64x64_avg_sse2;
-
-    vp9_sad32x64_avg = vp9_sad32x64_avg_c;
-    if (flags & HAS_SSE2) vp9_sad32x64_avg = vp9_sad32x64_avg_sse2;
-
-    vp9_sad64x32_avg = vp9_sad64x32_avg_c;
-    if (flags & HAS_SSE2) vp9_sad64x32_avg = vp9_sad64x32_avg_sse2;
-
-    vp9_sad32x16_avg = vp9_sad32x16_avg_c;
-    if (flags & HAS_SSE2) vp9_sad32x16_avg = vp9_sad32x16_avg_sse2;
-
-    vp9_sad16x32_avg = vp9_sad16x32_avg_c;
-    if (flags & HAS_SSE2) vp9_sad16x32_avg = vp9_sad16x32_avg_sse2;
-
-    vp9_sad32x32_avg = vp9_sad32x32_avg_c;
-    if (flags & HAS_SSE2) vp9_sad32x32_avg = vp9_sad32x32_avg_sse2;
-
-    vp9_sad16x16_avg = vp9_sad16x16_avg_c;
-    if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2;
-
-    vp9_sad16x8_avg = vp9_sad16x8_avg_c;
-    if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2;
-
-    vp9_sad8x16_avg = vp9_sad8x16_avg_c;
-    if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2;
-
-    vp9_sad8x8_avg = vp9_sad8x8_avg_c;
-    if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2;
-
-    vp9_sad8x4_avg = vp9_sad8x4_avg_c;
-    if (flags & HAS_SSE2) vp9_sad8x4_avg = vp9_sad8x4_avg_sse2;
-
-    vp9_sad4x8_avg = vp9_sad4x8_avg_c;
-    if (flags & HAS_SSE) vp9_sad4x8_avg = vp9_sad4x8_avg_sse;
-
-    vp9_sad4x4_avg = vp9_sad4x4_avg_c;
-    if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse;
-
-    vp9_variance_halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_c;
-    if (flags & HAS_SSE2) vp9_variance_halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_sse2;
-
-    vp9_variance_halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_c;
-    if (flags & HAS_SSE2) vp9_variance_halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_sse2;
-
-    vp9_variance_halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_c;
-    if (flags & HAS_SSE2) vp9_variance_halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_sse2;
-
-
-
-
-
-
-
-
-
-    vp9_sad16x16x3 = vp9_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
-
-    vp9_sad16x8x3 = vp9_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
-
-    vp9_sad8x16x3 = vp9_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
-
-    vp9_sad8x8x3 = vp9_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
-
-    vp9_sad4x4x3 = vp9_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
-
-
-
-
-
-
-
-
-
-
-    vp9_sad64x64x4d = vp9_sad64x64x4d_c;
-    if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
-
-    vp9_sad32x64x4d = vp9_sad32x64x4d_c;
-    if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
-
-    vp9_sad64x32x4d = vp9_sad64x32x4d_c;
-    if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2;
-
-    vp9_sad32x16x4d = vp9_sad32x16x4d_c;
-    if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2;
-
-    vp9_sad16x32x4d = vp9_sad16x32x4d_c;
-    if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
-
-    vp9_sad32x32x4d = vp9_sad32x32x4d_c;
-    if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
-
-    vp9_sad16x16x4d = vp9_sad16x16x4d_c;
-    if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2;
-
-    vp9_sad16x8x4d = vp9_sad16x8x4d_c;
-    if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2;
-
-    vp9_sad8x16x4d = vp9_sad8x16x4d_c;
-    if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2;
-
-    vp9_sad8x8x4d = vp9_sad8x8x4d_c;
-    if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2;
-
-    vp9_sad8x4x4d = vp9_sad8x4x4d_c;
-    if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
-
-    vp9_sad4x8x4d = vp9_sad4x8x4d_c;
-    if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse;
-
-    vp9_sad4x4x4d = vp9_sad4x4x4d_c;
-    if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse;
-
-    vp9_mse16x16 = vp9_mse16x16_c;
-    if (flags & HAS_MMX) vp9_mse16x16 = vp9_mse16x16_mmx;
-    if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2;
-
-
-
-
-
-    vp9_get_mb_ss = vp9_get_mb_ss_c;
-    if (flags & HAS_MMX) vp9_get_mb_ss = vp9_get_mb_ss_mmx;
-    if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2;
-
-    vp9_block_error = vp9_block_error_c;
-    if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
-
-    vp9_subtract_block = vp9_subtract_block_c;
-    if (flags & HAS_SSE2) vp9_subtract_block = vp9_subtract_block_sse2;
-
-
-    vp9_short_fht4x4 = vp9_short_fht4x4_c;
-    if (flags & HAS_SSE2) vp9_short_fht4x4 = vp9_short_fht4x4_sse2;
-
-    vp9_short_fht8x8 = vp9_short_fht8x8_c;
-    if (flags & HAS_SSE2) vp9_short_fht8x8 = vp9_short_fht8x8_sse2;
-
-    vp9_short_fht16x16 = vp9_short_fht16x16_c;
-    if (flags & HAS_SSE2) vp9_short_fht16x16 = vp9_short_fht16x16_sse2;
-
-
-    vp9_fdct4x4 = vp9_fdct4x4_c;
-    if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2;
-
-    vp9_fdct8x8 = vp9_fdct8x8_c;
-    if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2;
-
+    vp9_convolve8_horiz = vp9_convolve8_horiz_c;
+    if (flags & HAS_SSE2) vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
+    vp9_convolve8_vert = vp9_convolve8_vert_c;
+    if (flags & HAS_SSE2) vp9_convolve8_vert = vp9_convolve8_vert_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
+    vp9_convolve_avg = vp9_convolve_avg_c;
+    if (flags & HAS_SSE2) vp9_convolve_avg = vp9_convolve_avg_sse2;
+    vp9_convolve_copy = vp9_convolve_copy_c;
+    if (flags & HAS_SSE2) vp9_convolve_copy = vp9_convolve_copy_sse2;
+    vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3;
+    vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3;
+    vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3;
+    vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3;
+    vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3;
+    vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3;
+    vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3;
+    vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3;
+    vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3;
+    vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3;
+    vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3;
+    vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3;
+    vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+    vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3;
+    vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3;
+    vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_c;
+    if (flags & HAS_SSE2) vp9_dc_predictor_16x16 = vp9_dc_predictor_16x16_sse2;
+    vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_c;
+    if (flags & HAS_SSE2) vp9_dc_predictor_32x32 = vp9_dc_predictor_32x32_sse2;
+    vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_c;
+    if (flags & HAS_SSE) vp9_dc_predictor_4x4 = vp9_dc_predictor_4x4_sse;
+    vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_c;
+    if (flags & HAS_SSE) vp9_dc_predictor_8x8 = vp9_dc_predictor_8x8_sse;
     vp9_fdct16x16 = vp9_fdct16x16_c;
     if (flags & HAS_SSE2) vp9_fdct16x16 = vp9_fdct16x16_sse2;
-
+    vp9_fdct16x16_1 = vp9_fdct16x16_1_c;
+    if (flags & HAS_SSE2) vp9_fdct16x16_1 = vp9_fdct16x16_1_sse2;
     vp9_fdct32x32 = vp9_fdct32x32_c;
     if (flags & HAS_SSE2) vp9_fdct32x32 = vp9_fdct32x32_sse2;
-
+    vp9_fdct32x32_1 = vp9_fdct32x32_1_c;
+    if (flags & HAS_SSE2) vp9_fdct32x32_1 = vp9_fdct32x32_1_sse2;
     vp9_fdct32x32_rd = vp9_fdct32x32_rd_c;
     if (flags & HAS_SSE2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2;
-
+    vp9_fdct4x4 = vp9_fdct4x4_c;
+    if (flags & HAS_SSE2) vp9_fdct4x4 = vp9_fdct4x4_sse2;
+    vp9_fdct4x4_1 = vp9_fdct4x4_1_c;
+    if (flags & HAS_SSE2) vp9_fdct4x4_1 = vp9_fdct4x4_1_sse2;
+    vp9_fdct8x8 = vp9_fdct8x8_c;
+    if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2;
+    vp9_fdct8x8_1 = vp9_fdct8x8_1_c;
+    if (flags & HAS_SSE2) vp9_fdct8x8_1 = vp9_fdct8x8_1_sse2;
+    vp9_fht16x16 = vp9_fht16x16_c;
+    if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
+    vp9_fht4x4 = vp9_fht4x4_c;
+    if (flags & HAS_SSE2) vp9_fht4x4 = vp9_fht4x4_sse2;
+    vp9_fht8x8 = vp9_fht8x8_c;
+    if (flags & HAS_SSE2) vp9_fht8x8 = vp9_fht8x8_sse2;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
-
-    vp9_refining_search_sad = vp9_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4;
-
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4;
-
+    vp9_fwht4x4 = vp9_fwht4x4_c;
+    if (flags & HAS_MMX) vp9_fwht4x4 = vp9_fwht4x4_mmx;
+    vp9_get16x16var = vp9_get16x16var_c;
+    if (flags & HAS_SSE2) vp9_get16x16var = vp9_get16x16var_sse2;
+    vp9_get8x8var = vp9_get8x8var_c;
+    if (flags & HAS_SSE2) vp9_get8x8var = vp9_get8x8var_sse2;
+    vp9_get_mb_ss = vp9_get_mb_ss_c;
+    if (flags & HAS_SSE2) vp9_get_mb_ss = vp9_get_mb_ss_sse2;
+    vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
+    vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
+    vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3;
+    vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3;
+    vp9_idct16x16_10_add = vp9_idct16x16_10_add_c;
+    if (flags & HAS_SSE2) vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3;
+    vp9_idct16x16_1_add = vp9_idct16x16_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct16x16_1_add = vp9_idct16x16_1_add_sse2;
+    vp9_idct16x16_256_add = vp9_idct16x16_256_add_c;
+    if (flags & HAS_SSE2) vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3;
+    vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c;
+    if (flags & HAS_SSE2) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_sse2;
+    vp9_idct32x32_1_add = vp9_idct32x32_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct32x32_1_add = vp9_idct32x32_1_add_sse2;
+    vp9_idct32x32_34_add = vp9_idct32x32_34_add_c;
+    if (flags & HAS_SSE2) vp9_idct32x32_34_add = vp9_idct32x32_34_add_sse2;
+    vp9_idct4x4_16_add = vp9_idct4x4_16_add_c;
+    if (flags & HAS_SSE2) vp9_idct4x4_16_add = vp9_idct4x4_16_add_sse2;
+    vp9_idct4x4_1_add = vp9_idct4x4_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct4x4_1_add = vp9_idct4x4_1_add_sse2;
+    vp9_idct8x8_12_add = vp9_idct8x8_12_add_c;
+    if (flags & HAS_SSE2) vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2;
+    vp9_idct8x8_1_add = vp9_idct8x8_1_add_c;
+    if (flags & HAS_SSE2) vp9_idct8x8_1_add = vp9_idct8x8_1_add_sse2;
+    vp9_idct8x8_64_add = vp9_idct8x8_64_add_c;
+    if (flags & HAS_SSE2) vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
+    vp9_iht16x16_256_add = vp9_iht16x16_256_add_c;
+    if (flags & HAS_SSE2) vp9_iht16x16_256_add = vp9_iht16x16_256_add_sse2;
+    vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
+    if (flags & HAS_SSE2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
+    vp9_iht8x8_64_add = vp9_iht8x8_64_add_c;
+    if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
+    vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2;
+    vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_c;
+    if (flags & HAS_MMX) vp9_lpf_horizontal_4 = vp9_lpf_horizontal_4_mmx;
+    vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_4_dual = vp9_lpf_horizontal_4_dual_sse2;
+    vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_8 = vp9_lpf_horizontal_8_sse2;
+    vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_horizontal_8_dual = vp9_lpf_horizontal_8_dual_sse2;
+    vp9_lpf_vertical_16 = vp9_lpf_vertical_16_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_16 = vp9_lpf_vertical_16_sse2;
+    vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_16_dual = vp9_lpf_vertical_16_dual_sse2;
+    vp9_lpf_vertical_4 = vp9_lpf_vertical_4_c;
+    if (flags & HAS_MMX) vp9_lpf_vertical_4 = vp9_lpf_vertical_4_mmx;
+    vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_4_dual = vp9_lpf_vertical_4_dual_sse2;
+    vp9_lpf_vertical_8 = vp9_lpf_vertical_8_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_8 = vp9_lpf_vertical_8_sse2;
+    vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_c;
+    if (flags & HAS_SSE2) vp9_lpf_vertical_8_dual = vp9_lpf_vertical_8_dual_sse2;
+    vp9_mse16x16 = vp9_mse16x16_c;
+    if (flags & HAS_SSE2) vp9_mse16x16 = vp9_mse16x16_sse2;
+    vp9_mse16x8 = vp9_mse16x8_c;
+    if (flags & HAS_SSE2) vp9_mse16x8 = vp9_mse16x8_sse2;
+    vp9_mse8x16 = vp9_mse8x16_c;
+    if (flags & HAS_SSE2) vp9_mse8x16 = vp9_mse8x16_sse2;
+    vp9_mse8x8 = vp9_mse8x8_c;
+    if (flags & HAS_SSE2) vp9_mse8x8 = vp9_mse8x8_sse2;
+    vp9_sad16x16 = vp9_sad16x16_c;
+    if (flags & HAS_SSE2) vp9_sad16x16 = vp9_sad16x16_sse2;
+    vp9_sad16x16_avg = vp9_sad16x16_avg_c;
+    if (flags & HAS_SSE2) vp9_sad16x16_avg = vp9_sad16x16_avg_sse2;
+    vp9_sad16x16x3 = vp9_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
+    vp9_sad16x16x4d = vp9_sad16x16x4d_c;
+    if (flags & HAS_SSE2) vp9_sad16x16x4d = vp9_sad16x16x4d_sse2;
+    vp9_sad16x32 = vp9_sad16x32_c;
+    if (flags & HAS_SSE2) vp9_sad16x32 = vp9_sad16x32_sse2;
+    vp9_sad16x32_avg = vp9_sad16x32_avg_c;
+    if (flags & HAS_SSE2) vp9_sad16x32_avg = vp9_sad16x32_avg_sse2;
+    vp9_sad16x32x4d = vp9_sad16x32x4d_c;
+    if (flags & HAS_SSE2) vp9_sad16x32x4d = vp9_sad16x32x4d_sse2;
+    vp9_sad16x8 = vp9_sad16x8_c;
+    if (flags & HAS_SSE2) vp9_sad16x8 = vp9_sad16x8_sse2;
+    vp9_sad16x8_avg = vp9_sad16x8_avg_c;
+    if (flags & HAS_SSE2) vp9_sad16x8_avg = vp9_sad16x8_avg_sse2;
+    vp9_sad16x8x3 = vp9_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
+    vp9_sad16x8x4d = vp9_sad16x8x4d_c;
+    if (flags & HAS_SSE2) vp9_sad16x8x4d = vp9_sad16x8x4d_sse2;
+    vp9_sad32x16 = vp9_sad32x16_c;
+    if (flags & HAS_SSE2) vp9_sad32x16 = vp9_sad32x16_sse2;
+    vp9_sad32x16_avg = vp9_sad32x16_avg_c;
+    if (flags & HAS_SSE2) vp9_sad32x16_avg = vp9_sad32x16_avg_sse2;
+    vp9_sad32x16x4d = vp9_sad32x16x4d_c;
+    if (flags & HAS_SSE2) vp9_sad32x16x4d = vp9_sad32x16x4d_sse2;
+    vp9_sad32x32 = vp9_sad32x32_c;
+    if (flags & HAS_SSE2) vp9_sad32x32 = vp9_sad32x32_sse2;
+    vp9_sad32x32_avg = vp9_sad32x32_avg_c;
+    if (flags & HAS_SSE2) vp9_sad32x32_avg = vp9_sad32x32_avg_sse2;
+    vp9_sad32x32x4d = vp9_sad32x32x4d_c;
+    if (flags & HAS_SSE2) vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
+    vp9_sad32x64 = vp9_sad32x64_c;
+    if (flags & HAS_SSE2) vp9_sad32x64 = vp9_sad32x64_sse2;
+    vp9_sad32x64_avg = vp9_sad32x64_avg_c;
+    if (flags & HAS_SSE2) vp9_sad32x64_avg = vp9_sad32x64_avg_sse2;
+    vp9_sad32x64x4d = vp9_sad32x64x4d_c;
+    if (flags & HAS_SSE2) vp9_sad32x64x4d = vp9_sad32x64x4d_sse2;
+    vp9_sad4x4 = vp9_sad4x4_c;
+    if (flags & HAS_SSE) vp9_sad4x4 = vp9_sad4x4_sse;
+    vp9_sad4x4_avg = vp9_sad4x4_avg_c;
+    if (flags & HAS_SSE) vp9_sad4x4_avg = vp9_sad4x4_avg_sse;
+    vp9_sad4x4x3 = vp9_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
+    vp9_sad4x4x4d = vp9_sad4x4x4d_c;
+    if (flags & HAS_SSE) vp9_sad4x4x4d = vp9_sad4x4x4d_sse;
+    vp9_sad4x8 = vp9_sad4x8_c;
+    if (flags & HAS_SSE) vp9_sad4x8 = vp9_sad4x8_sse;
+    vp9_sad4x8_avg = vp9_sad4x8_avg_c;
+    if (flags & HAS_SSE) vp9_sad4x8_avg = vp9_sad4x8_avg_sse;
+    vp9_sad4x8x4d = vp9_sad4x8x4d_c;
+    if (flags & HAS_SSE) vp9_sad4x8x4d = vp9_sad4x8x4d_sse;
+    vp9_sad64x32 = vp9_sad64x32_c;
+    if (flags & HAS_SSE2) vp9_sad64x32 = vp9_sad64x32_sse2;
+    vp9_sad64x32_avg = vp9_sad64x32_avg_c;
+    if (flags & HAS_SSE2) vp9_sad64x32_avg = vp9_sad64x32_avg_sse2;
+    vp9_sad64x32x4d = vp9_sad64x32x4d_c;
+    if (flags & HAS_SSE2) vp9_sad64x32x4d = vp9_sad64x32x4d_sse2;
+    vp9_sad64x64 = vp9_sad64x64_c;
+    if (flags & HAS_SSE2) vp9_sad64x64 = vp9_sad64x64_sse2;
+    vp9_sad64x64_avg = vp9_sad64x64_avg_c;
+    if (flags & HAS_SSE2) vp9_sad64x64_avg = vp9_sad64x64_avg_sse2;
+    vp9_sad64x64x4d = vp9_sad64x64x4d_c;
+    if (flags & HAS_SSE2) vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
+    vp9_sad8x16 = vp9_sad8x16_c;
+    if (flags & HAS_SSE2) vp9_sad8x16 = vp9_sad8x16_sse2;
+    vp9_sad8x16_avg = vp9_sad8x16_avg_c;
+    if (flags & HAS_SSE2) vp9_sad8x16_avg = vp9_sad8x16_avg_sse2;
+    vp9_sad8x16x3 = vp9_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
+    vp9_sad8x16x4d = vp9_sad8x16x4d_c;
+    if (flags & HAS_SSE2) vp9_sad8x16x4d = vp9_sad8x16x4d_sse2;
+    vp9_sad8x4 = vp9_sad8x4_c;
+    if (flags & HAS_SSE2) vp9_sad8x4 = vp9_sad8x4_sse2;
+    vp9_sad8x4_avg = vp9_sad8x4_avg_c;
+    if (flags & HAS_SSE2) vp9_sad8x4_avg = vp9_sad8x4_avg_sse2;
+    vp9_sad8x4x4d = vp9_sad8x4x4d_c;
+    if (flags & HAS_SSE2) vp9_sad8x4x4d = vp9_sad8x4x4d_sse2;
+    vp9_sad8x8 = vp9_sad8x8_c;
+    if (flags & HAS_SSE2) vp9_sad8x8 = vp9_sad8x8_sse2;
+    vp9_sad8x8_avg = vp9_sad8x8_avg_c;
+    if (flags & HAS_SSE2) vp9_sad8x8_avg = vp9_sad8x8_avg_sse2;
+    vp9_sad8x8x3 = vp9_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
+    vp9_sad8x8x4d = vp9_sad8x8x4d_c;
+    if (flags & HAS_SSE2) vp9_sad8x8x4d = vp9_sad8x8x4d_sse2;
+    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
+    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
+    vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3;
+    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
+    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
+    vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3;
+    vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_c;
+    if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3;
+    vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_c;
+    if (flags & HAS_SSE) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3;
+    vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3;
+    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3;
+    vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3;
+    vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3;
+    vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3;
+    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
+    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
+    vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3;
+    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
+    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
+    vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3;
+    vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_c;
+    if (flags & HAS_SSE) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3;
+    vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_c;
+    if (flags & HAS_SSE) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3;
+    vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3;
+    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
+    vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3;
+    vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3;
+    vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_c;
+    if (flags & HAS_SSE2) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3;
+    vp9_subtract_block = vp9_subtract_block_c;
+    if (flags & HAS_SSE2) vp9_subtract_block = vp9_subtract_block_sse2;
     vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
     if (flags & HAS_SSE2) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse2;
+    vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_c;
+    if (flags & HAS_SSE2) vp9_tm_predictor_16x16 = vp9_tm_predictor_16x16_sse2;
+    vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_c;
+    if (flags & HAS_SSE) vp9_tm_predictor_4x4 = vp9_tm_predictor_4x4_sse;
+    vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_c;
+    if (flags & HAS_SSE2) vp9_tm_predictor_8x8 = vp9_tm_predictor_8x8_sse2;
+    vp9_v_predictor_16x16 = vp9_v_predictor_16x16_c;
+    if (flags & HAS_SSE2) vp9_v_predictor_16x16 = vp9_v_predictor_16x16_sse2;
+    vp9_v_predictor_32x32 = vp9_v_predictor_32x32_c;
+    if (flags & HAS_SSE2) vp9_v_predictor_32x32 = vp9_v_predictor_32x32_sse2;
+    vp9_v_predictor_4x4 = vp9_v_predictor_4x4_c;
+    if (flags & HAS_SSE) vp9_v_predictor_4x4 = vp9_v_predictor_4x4_sse;
+    vp9_v_predictor_8x8 = vp9_v_predictor_8x8_c;
+    if (flags & HAS_SSE) vp9_v_predictor_8x8 = vp9_v_predictor_8x8_sse;
+    vp9_variance16x16 = vp9_variance16x16_c;
+    if (flags & HAS_SSE2) vp9_variance16x16 = vp9_variance16x16_sse2;
+    vp9_variance16x32 = vp9_variance16x32_c;
+    if (flags & HAS_SSE2) vp9_variance16x32 = vp9_variance16x32_sse2;
+    vp9_variance16x8 = vp9_variance16x8_c;
+    if (flags & HAS_SSE2) vp9_variance16x8 = vp9_variance16x8_sse2;
+    vp9_variance32x16 = vp9_variance32x16_c;
+    if (flags & HAS_SSE2) vp9_variance32x16 = vp9_variance32x16_sse2;
+    vp9_variance32x32 = vp9_variance32x32_c;
+    if (flags & HAS_SSE2) vp9_variance32x32 = vp9_variance32x32_sse2;
+    vp9_variance32x64 = vp9_variance32x64_c;
+    if (flags & HAS_SSE2) vp9_variance32x64 = vp9_variance32x64_sse2;
+    vp9_variance4x4 = vp9_variance4x4_c;
+    if (flags & HAS_SSE2) vp9_variance4x4 = vp9_variance4x4_sse2;
+    vp9_variance4x8 = vp9_variance4x8_c;
+    if (flags & HAS_SSE2) vp9_variance4x8 = vp9_variance4x8_sse2;
+    vp9_variance64x32 = vp9_variance64x32_c;
+    if (flags & HAS_SSE2) vp9_variance64x32 = vp9_variance64x32_sse2;
+    vp9_variance64x64 = vp9_variance64x64_c;
+    if (flags & HAS_SSE2) vp9_variance64x64 = vp9_variance64x64_sse2;
+    vp9_variance8x16 = vp9_variance8x16_c;
+    if (flags & HAS_SSE2) vp9_variance8x16 = vp9_variance8x16_sse2;
+    vp9_variance8x4 = vp9_variance8x4_c;
+    if (flags & HAS_SSE2) vp9_variance8x4 = vp9_variance8x4_sse2;
+    vp9_variance8x8 = vp9_variance8x8_c;
+    if (flags & HAS_SSE2) vp9_variance8x8 = vp9_variance8x8_sse2;
 }
 #endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
 #endif
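The hunks above replace the body of the generated setup_rtcd_internal() for this target: each kernel pointer is first bound to its portable C implementation and then upgraded once per CPU feature reported in the flags word, so the last supported assignment wins. A minimal sketch of that dispatch pattern with a single hypothetical kernel (my_idct_add and the flag values below are illustrative, not part of libvpx):

    #include <stdint.h>

    #define HAS_MMX  0x01  /* illustrative bit values; the real header takes */
    #define HAS_SSE2 0x08  /* them from the library's own CPU detection      */

    static void my_idct_add_c(const int16_t *in, uint8_t *dst, int stride)    { /* portable fallback */ }
    static void my_idct_add_sse2(const int16_t *in, uint8_t *dst, int stride) { /* SSE2 fast path */ }

    /* In the generated header this is an RTCD_EXTERN pointer shared by all callers. */
    static void (*my_idct_add)(const int16_t *in, uint8_t *dst, int stride);

    static void setup_rtcd_internal(int flags) {
        my_idct_add = my_idct_add_c;                           /* always-safe default */
        if (flags & HAS_SSE2) my_idct_add = my_idct_add_sse2;  /* upgrade if detected */
    }

Because the checks run from oldest to newest extension, the fastest variant the running CPU supports is the one left in the pointer when setup finishes.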
diff --git a/media/libvpx/vp9_rtcd_x86_64-darwin9-gcc.h b/media/libvpx/vp9_rtcd_x86_64-darwin9-gcc.h
index 8bfe3af71454..dcc090dfe196 100644
--- a/media/libvpx/vp9_rtcd_x86_64-darwin9-gcc.h
+++ b/media/libvpx/vp9_rtcd_x86_64-darwin9-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif

+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
 * VP9
 */
@@ -19,255 +23,22 @@ struct macroblockd;
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct search_site_config;
+struct mv;
 union int_mv;
 struct yv12_buffer_config;

-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_sse2
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_sse2
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);

 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);

-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
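The declaration hunks in this header show the generator's two binding styles side by side. Where the optimized version can be assumed on every targeted processor (SSE2 is part of the x86-64 baseline), the symbol is bound at compile time with a #define alias; where the choice depends on a feature only some CPUs have (SSSE3, AVX2), the header declares an RTCD_EXTERN function pointer, which expands to extern here and is filled in by setup_rtcd_internal(). A compressed sketch of both styles (my_copy and my_filter are illustrative names, not libvpx functions):

    #include <stdint.h>

    /* Compile-time binding: safe when the target baseline guarantees SSE2. */
    void my_copy_c(const uint8_t *src, uint8_t *dst, int n);
    void my_copy_sse2(const uint8_t *src, uint8_t *dst, int n);
    #define my_copy my_copy_sse2

    /* Run-time binding: SSSE3 must be detected before the pointer is used. */
    void my_filter_c(const uint8_t *src, uint8_t *dst, int n);
    void my_filter_ssse3(const uint8_t *src, uint8_t *dst, int n);
    extern void (*my_filter)(const uint8_t *src, uint8_t *dst, int n);

The #define form costs nothing at the call site; the pointer form adds one indirect call but lets a single binary use newer instruction sets wherever they are present.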
@@ -283,25 +54,264 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8
 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);

-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);

-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);

-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_sse2

-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_sse2
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_sse2

-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_10_add vp9_idct8x8_10_add_sse2
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above,
const uint8_t *left); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct16x16 vp9_fdct16x16_sse2 + +void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct16x16_1 vp9_fdct16x16_1_sse2 + +void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride); +void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride); + +void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct32x32_1 vp9_fdct32x32_1_sse2 + +void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride); +void vp9_fdct32x32_rd_avx2(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride); + +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct4x4 vp9_fdct4x4_sse2 + +void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct4x4_1 vp9_fdct4x4_1_sse2 + +void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_ssse3(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride); + +void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct8x8_1 vp9_fdct8x8_1_sse2 + +void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_sse2 + +void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_sse2 + +void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_sse2 + +int vp9_full_range_search_c(const 
struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_full_range_search vp9_full_range_search_c + +int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); + +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride); +void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_mmx + +void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); + +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vp9_get8x8var vp9_get8x8var_sse2 + +unsigned int vp9_get_mb_ss_c(const int16_t *); +unsigned int vp9_get_mb_ss_sse2(const int16_t *); +#define vp9_get_mb_ss vp9_get_mb_ss_sse2 + +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
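/*
 * The intra predictors declared here all share one shape: fill a square
 * block at dst from the reconstructed `above` row and `left` column. As a
 * concrete example, a horizontal predictor (the vp9_h_predictor_* family)
 * replicates each left-neighbor pixel across its row. This reference
 * version is a sketch written for this note, not code quoted from vp9:
 */
static void h_predictor_ref(uint8_t *dst, ptrdiff_t y_stride, int bs,
                            const uint8_t *above, const uint8_t *left)
{
    int r, c;
    (void)above;  /* the horizontal mode predicts from `left` only */
    for (r = 0; r < bs; ++r) {
        for (c = 0; c < bs; ++c) dst[c] = left[r];
        dst += y_stride;
    }
}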
+RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
@@ -309,23 +319,46 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stri
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_sse2
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_sse2
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_sse2
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
+
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_idct32x32_34_add vp9_idct32x32_34_add_sse2
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
+
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
+
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_64_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
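/*
 * The numeric suffixes on the idct kernels are coefficient counts, not
 * version numbers: _1 handles a DC-only block, _12/_10/_34 cover blocks
 * whose non-zero coefficients all sit in the low-frequency corner, and
 * _64/_256/_1024 are the full transforms. Callers pick the cheapest
 * variant that covers the eob (end-of-block) left by entropy decoding.
 * A sketch of that dispatch, with thresholds assumed from the names
 * rather than quoted from vp9_idct.c:
 */
static void idct8x8_add_example(const int16_t *input, uint8_t *dest,
                                int dest_stride, int eob)
{
    if (eob == 1)
        vp9_idct8x8_1_add(input, dest, dest_stride);   /* DC only */
    else if (eob <= 12)
        vp9_idct8x8_12_add(input, dest, dest_stride);  /* sparse block */
    else
        vp9_idct8x8_64_add(input, dest, dest_stride);  /* full 8x8 */
}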
+RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
@@ -335,485 +368,73 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
#define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
#define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_sse2
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_sse2
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_sse2
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_sse2
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_sse2
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_sse2
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int
vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_sse2 - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_sse2 - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_sse2 - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_sse2 - -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get_sse_sum_8x8 vp9_get8x8var_sse2 - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_sse2 - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_sse2 - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_sse2 - -unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int 
vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); 
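/*
 * Every variance kernel being rearranged in this hunk computes the same
 * two outputs: the sum of squared differences (returned through *sse) and
 * the block variance, i.e. sse minus the squared mean error. The
 * sub_pixel variants first interpolate src at an eighth-pel (xoffset,
 * yoffset) phase, and the _avg variants average in second_pred, but the
 * core arithmetic is the one below. A reference sketch written for this
 * note (the helper name is illustrative):
 */
static unsigned int variance_ref(const uint8_t *src_ptr, int source_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
                                 int w, int h, unsigned int *sse)
{
    int r, c;
    int64_t sum = 0;
    uint64_t sq = 0;
    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
            const int d = src_ptr[c] - ref_ptr[c];
            sum += d;
            sq += (uint64_t)(d * d);
        }
        src_ptr += source_stride;
        ref_ptr += ref_stride;
    }
    *sse = (unsigned int)sq;
    /* variance = sse - sum^2 / N; never negative by Cauchy-Schwarz */
    return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}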
-unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x64 vp9_sad64x64_sse2 - -unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x64 vp9_sad32x64_sse2 - -unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x32 vp9_sad64x32_sse2 - -unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x16 vp9_sad32x16_sse2 - -unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x32 vp9_sad16x32_sse2 - -unsigned int 
vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x32 vp9_sad32x32_sse2 - -unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x16 vp9_sad16x16_sse2 - -unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x8 vp9_sad16x8_sse2 - -unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x16 vp9_sad8x16_sse2 - -unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x8 vp9_sad8x8_sse2 - -unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x4 vp9_sad8x4_sse2 - -unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x8 vp9_sad4x8_sse - -unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x4 vp9_sad4x4_sse - -unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2 - -unsigned int 
vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2 - -unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2 - -unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2 - -unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2 - -unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2 - -unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2 - -unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2 - -unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2 - -unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2 - -unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2 - -unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x8_avg vp9_sad4x8_avg_sse - -unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x4_avg vp9_sad4x4_avg_sse - -unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance_halfpixvar16x16_h_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_sse2 - -unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance_halfpixvar16x16_v_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_sse2 - -unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_sse2 - -unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c - -unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c - -unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c - -unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c - -unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c - -unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c - -void 
vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x3 vp9_sad64x64x3_c - -void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x3 vp9_sad32x32x3_c - -void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad64x64x8 vp9_sad64x64x8_c - -void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad32x32x8 vp9_sad32x32x8_c - -void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x16x8 vp9_sad16x16x8_c - -void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x8x8 vp9_sad16x8x8_c - -void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x16x8 vp9_sad8x16x8_c - -void vp9_sad8x8x8_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x8x8 vp9_sad8x8x8_c - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x4x8 vp9_sad8x4x8_c - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x8x8 vp9_sad4x8x8_c - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x4x8 vp9_sad4x4x8_c - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x4d vp9_sad64x64x4d_sse2 - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2 - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2 - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2 - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x4d vp9_sad32x32x4d_sse2 - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2 - -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2 - -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2 - -void 
vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2 - -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2 - -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad4x8x4d vp9_sad4x8x4d_sse - -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad4x4x4d vp9_sad4x4x4d_sse +void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + +void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_mmx + +void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_sse2 + +void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_sse2 + +void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_sse2 + +void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void 
vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_sse2 + +void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_sse2 + +void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_mmx + +void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_sse2 + +void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_sse2 + +void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x16 vp9_mse16x16_sse2 - -unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x16 vp9_mse8x16_c +unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse16x8 vp9_mse16x8_c +unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse16x8 vp9_mse16x8_sse2 + +unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int 
*sse); +#define vp9_mse8x16 vp9_mse8x16_sse2 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vp9_mse8x8 vp9_mse8x8_c - -unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c - -unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c - -unsigned int vp9_get_mb_ss_c(const int16_t *); -unsigned int vp9_get_mb_ss_mmx(const int16_t *); -unsigned int vp9_get_mb_ss_sse2(const int16_t *); -#define vp9_get_mb_ss vp9_get_mb_ss_sse2 - -int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_sse2 - -void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); -#define vp9_subtract_block vp9_subtract_block_sse2 +unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vp9_mse8x8 vp9_mse8x8_sse2 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); @@ -823,60 +444,460 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int ski void vp9_quantize_b_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); RTCD_EXTERN void (*vp9_quantize_b_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type); -void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); -#define vp9_short_fht4x4 vp9_short_fht4x4_sse2 +void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, 
const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +RTCD_EXTERN void (*vp9_quantize_fp)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type); -void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); -#define vp9_short_fht8x8 vp9_short_fht8x8_sse2 +void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type); -void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); -#define vp9_short_fht16x16 vp9_short_fht16x16_sse2 +int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_refining_search_sad vp9_refining_search_sad_c -void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride); -#define vp9_fwht4x4 vp9_fwht4x4_c +unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x16 vp9_sad16x16_sse2 -void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct4x4 vp9_fdct4x4_sse2 +unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, const uint8_t *second_pred); +#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2 -void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct8x8 vp9_fdct8x8_sse2 +void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct16x16 vp9_fdct16x16_sse2 +void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2 -void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct32x32 vp9_fdct32x32_sse2 +void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x16x8 vp9_sad16x16x8_c -void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride); -void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride); -#define vp9_fdct32x32_rd vp9_fdct32x32_rd_sse2 +unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x32 vp9_sad16x32_sse2 -int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); -RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n); +unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2 -int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, 
struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); +void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2 -int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); -RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv); +unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad16x8 vp9_sad16x8_sse2 -void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2 + +void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2 + +void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad16x8x8 vp9_sad16x8x8_c + +unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x16 
vp9_sad32x16_sse2 + +unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2 + +void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2 + +unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x32 vp9_sad32x32_sse2 + +unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2 + +void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x3 vp9_sad32x32x3_c + +void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad32x32x8 vp9_sad32x32x8_c + +unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x64 vp9_sad32x64_sse2 + +unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2 + +void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 + +unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x4 vp9_sad4x4_sse + +unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred); +unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x4_avg vp9_sad4x4_avg_sse + +void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad4x4x4d vp9_sad4x4x4d_sse + +void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x4x8 vp9_sad4x4x8_c + +unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x8 vp9_sad4x8_sse + +unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x8_avg vp9_sad4x8_avg_sse + +void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad4x8x4d vp9_sad4x8x4d_sse + +void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x8x8 vp9_sad4x8x8_c + +unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x32 vp9_sad64x32_sse2 + +unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2 + +void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2 + +unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x64 vp9_sad64x64_sse2 + +unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int 
vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2 + +void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x3 vp9_sad64x64x3_c + +void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); + +void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad64x64x8 vp9_sad64x64x8_c + +unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x16 vp9_sad8x16_sse2 + +unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2 + +void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2 + +void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x16x8 vp9_sad8x16x8_c + +unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x4 vp9_sad8x4_sse2 + +unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2 + +void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2 + +void vp9_sad8x4x8_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x4x8 vp9_sad8x4x8_c + +unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x8 vp9_sad8x8_sse2 + +unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2 + +void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2 + +void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x8x8 vp9_sad8x8x8_c + +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int 
vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t 
*second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const 
uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vp9_subtract_block vp9_subtract_block_sse2
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
 
-void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction);
-#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
+
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2
+
+void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
+
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
+
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
+
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
+
+void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
+
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
+
+unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x32 vp9_variance16x32_sse2
+
+unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x8 vp9_variance16x8_sse2
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance32x64 vp9_variance32x64_sse2
+
+unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x4 vp9_variance4x4_sse2
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x8 vp9_variance4x8_sse2
+
+unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x16 vp9_variance8x16_sse2
+
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x4 vp9_variance8x4_sse2
+
+unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x8 vp9_variance8x8_sse2
 
 void vp9_rtcd(void);
 
@@ -888,344 +909,179 @@ static void setup_rtcd_internal(void)
     (void)flags;
 
-    vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3;
-
-    vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3;
-
-    vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3;
-
-    vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3;
-
-
-
-    vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3;
-
-    vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3;
-
-    vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3;
-
-    vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3;
-
-
-
-    vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3;
-
-    vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3;
-
-    vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3;
-
-    vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
-
-
-
-    vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3;
-
-    vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3;
-
-    vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
-
-    vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
-    if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
-
-
-
-
-
-
-
-
+    vp9_block_error = vp9_block_error_sse2;
+    if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
     vp9_convolve8 = vp9_convolve8_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3;
-
-    vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
-    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
-
-    vp9_convolve8_vert = vp9_convolve8_vert_sse2;
-    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
-
+    if (flags & HAS_AVX2) vp9_convolve8 = vp9_convolve8_avx2;
     vp9_convolve8_avg = vp9_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3;
-
     vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3;
-
     vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
-
-    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3;
-
-    vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3;
-
-    vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3;
-
-    vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3;
-
-    vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3;
-
-    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
-
-    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
-
-    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
-
-    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
-
-    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
-
-    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
-
-    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
-
-    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
-
-    vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3;
-
-    vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3;
-
-    vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3;
-
-    vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3;
-
-    vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3;
-
-    vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3;
-
-    vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3;
-
-    vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3;
-
-    vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3;
-
-    vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3;
-
-    vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3;
-
-    vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sad16x16x3 = vp9_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
-
-    vp9_sad16x8x3 = vp9_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
-
-    vp9_sad8x16x3 = vp9_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
-
-    vp9_sad8x8x3 = vp9_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
-
-    vp9_sad4x4x3 = vp9_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_quantize_b = vp9_quantize_b_c;
-    if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
-
-    vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
-    if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
-
-
-
-
-
-
-
-
-
-
+    vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
+    if (flags & HAS_AVX2) vp9_convolve8_horiz = vp9_convolve8_horiz_avx2;
+    vp9_convolve8_vert = vp9_convolve8_vert_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
+    if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2;
+    vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3;
+    vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3;
+    vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3;
+    vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3;
+    vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3;
+    vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3;
+    vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3;
+    vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3;
+    vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3;
+    vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3;
+    vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3;
+    vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3;
+    vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+    vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3;
+    vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3;
+    vp9_fdct32x32 = vp9_fdct32x32_sse2;
+    if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2;
+    vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2;
+    if (flags & HAS_AVX2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_avx2;
+    vp9_fdct8x8 = vp9_fdct8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_fdct8x8 = vp9_fdct8x8_ssse3;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
-
-    vp9_refining_search_sad = vp9_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4;
-
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4;
+    vp9_get16x16var = vp9_get16x16var_sse2;
+    if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2;
+    vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
+    vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
+    vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3;
+    vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3;
+    vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3;
+    vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3;
+    vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3;
+    vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3;
+    vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2;
+    if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2;
+    vp9_mse16x16 = vp9_mse16x16_sse2;
+    if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2;
+    vp9_quantize_b = vp9_quantize_b_c;
+    if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
+    vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
+    vp9_quantize_fp = vp9_quantize_fp_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    vp9_sad16x16x3 = vp9_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
+    vp9_sad16x8x3 = vp9_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
+    vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
+    if (flags & HAS_AVX2) vp9_sad32x32x4d = vp9_sad32x32x4d_avx2;
+    vp9_sad4x4x3 = vp9_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
+    vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
+    if (flags & HAS_AVX2) vp9_sad64x64x4d = vp9_sad64x64x4d_avx2;
+    vp9_sad8x16x3 = vp9_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
+    vp9_sad8x8x3 = vp9_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
+    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
+    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
+    vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3;
+    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
+    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_avx2;
+    vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3;
+    vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3;
+    vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3;
+    vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3;
+    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_avx2;
+    vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3;
+    vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3;
+    vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3;
+    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
+    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
+    vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3;
+    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
+    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_avx2;
+    vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3;
+    vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3;
+    vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3;
+    vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3;
+    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_avx2;
+    vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3;
+    vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3;
+    vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3;
+    vp9_variance16x16 = vp9_variance16x16_sse2;
+    if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2;
+    vp9_variance32x16 = vp9_variance32x16_sse2;
+    if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2;
+    vp9_variance32x32 = vp9_variance32x32_sse2;
+    if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2;
+    vp9_variance64x32 = vp9_variance64x32_sse2;
+    if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2;
+    vp9_variance64x64 = vp9_variance64x64_sse2;
+    if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
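[Editor's note, not part of the patch: every generated header in this update follows the same run-time CPU detection (RTCD) pattern. For each kernel the header declares all built variants, then either binds the name with a `#define` (when the best variant is unconditional for the target) or declares an `RTCD_EXTERN` function pointer that `setup_rtcd_internal()` rebinds once the SIMD capability flags are known. The following is a minimal standalone sketch of that mechanism; the variance stubs and the `cpu_caps()` probe are invented stand-ins, and the HAS_* values mirror the pattern rather than libvpx's real flag encoding or its `x86_simd_caps()` probe.]

```c
/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

#define HAS_SSSE3 (1 << 0)  /* assumed flag values, for the sketch only */
#define HAS_AVX2  (1 << 1)

typedef unsigned int (*variance_fn)(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse);

/* Stand-ins for the _sse2/_avx2 variants a real RTCD header declares;
 * each returns a marker so the dispatch choice is observable. */
static unsigned int variance16x16_sse2(const uint8_t *src_ptr, int source_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       unsigned int *sse) {
  (void)src_ptr; (void)source_stride; (void)ref_ptr; (void)ref_stride;
  *sse = 0;
  return 2; /* marker: SSE2 path */
}

static unsigned int variance16x16_avx2(const uint8_t *src_ptr, int source_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       unsigned int *sse) {
  (void)src_ptr; (void)source_stride; (void)ref_ptr; (void)ref_stride;
  *sse = 0;
  return 3; /* marker: AVX2 path */
}

/* The dispatch pointer; `static` here, but RTCD_EXTERN (extern) in the
 * real multi-translation-unit header. */
static variance_fn vp9_variance16x16_ptr;

/* Fake capability probe standing in for x86_simd_caps(). */
static int cpu_caps(void) { return HAS_SSSE3; }

static void setup_rtcd_internal_sketch(void) {
  int flags = cpu_caps();
  /* x86-64 guarantees SSE2, so that is the unconditional baseline... */
  vp9_variance16x16_ptr = variance16x16_sse2;
  /* ...upgraded only when the CPU reports the faster extension. */
  if (flags & HAS_AVX2) vp9_variance16x16_ptr = variance16x16_avx2;
}

int main(void) {
  unsigned int sse;
  setup_rtcd_internal_sketch();
  /* Prints 2: cpu_caps() lacks HAS_AVX2, so the SSE2 binding stands. */
  printf("path marker: %u\n", vp9_variance16x16_ptr(0, 0, 0, 0, &sse));
  return 0;
}
```

[This also shows why kernels with a single best variant for the target, such as vp9_variance16x32 above, collapse to a plain `#define ..._sse2` and pay no pointer indirection, while kernels with optional SSSE3/AVX2 builds go through the rebindable pointer.]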
diff --git a/media/libvpx/vp9_rtcd_x86_64-linux-gcc.h b/media/libvpx/vp9_rtcd_x86_64-linux-gcc.h
index db6bbc2485ec..dcc090dfe196 100644
--- a/media/libvpx/vp9_rtcd_x86_64-linux-gcc.h
+++ b/media/libvpx/vp9_rtcd_x86_64-linux-gcc.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP9
  */
@@ -19,254 +23,22 @@ struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct search_site_config;
+struct mv;
 union int_mv;
 struct yv12_buffer_config;
 
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_sse2
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_sse2
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_sse2
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 
 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -282,25 +54,264 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8
 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_sse2
 
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_sse2
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_sse2
 
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_10_add vp9_idct8x8_10_add_sse2
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16 vp9_fdct16x16_sse2
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16_1 vp9_fdct16x16_1_sse2
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32_1 vp9_fdct32x32_1_sse2
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_avx2(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride); + +void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct4x4 vp9_fdct4x4_sse2 + +void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct4x4_1 vp9_fdct4x4_1_sse2 + +void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_ssse3(const int16_t *input, int16_t *output, int stride); +RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride); + +void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride); +void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride); +#define vp9_fdct8x8_1 vp9_fdct8x8_1_sse2 + +void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_sse2 + +void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_sse2 + +void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type); +void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_sse2 + +int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_full_range_search vp9_full_range_search_c + +int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); + +void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride); +void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_mmx + +void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse, int *sum); + +void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vp9_get8x8var vp9_get8x8var_sse2 + +unsigned int vp9_get_mb_ss_c(const int16_t *); +unsigned int vp9_get_mb_ss_sse2(const int16_t *); +#define vp9_get_mb_ss vp9_get_mb_ss_sse2 + +void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); @@ -308,23 +319,46 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stri void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct16x16_256_add vp9_idct16x16_256_add_sse2 - -void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct16x16_10_add vp9_idct16x16_10_add_sse2 +void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); #define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_sse2 +void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_idct32x32_1_add 
vp9_idct32x32_1_add_sse2 + void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride); void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); #define vp9_idct32x32_34_add vp9_idct32x32_34_add_sse2 -void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2 +void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2 + +void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2 + +void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_12_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2 + +void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); +void vp9_idct8x8_64_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride); +RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride); + +void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); @@ -334,485 +368,73 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type); -#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 +void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c -void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c - -unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x16 vp9_variance32x16_sse2 - -unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x32 vp9_variance16x32_sse2 - -unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x32 vp9_variance64x32_sse2 - -unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x64 vp9_variance32x64_sse2 - -unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance32x32 vp9_variance32x32_sse2 - -unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance64x64 vp9_variance64x64_sse2 - -unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x16 vp9_variance16x16_sse2 - -unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance16x8 vp9_variance16x8_sse2 - -unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x16 vp9_variance8x16_sse2 - -unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x8 vp9_variance8x8_sse2 - -void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int 
*sum); -void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -#define vp9_get_sse_sum_8x8 vp9_get8x8var_sse2 - -unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance8x4 vp9_variance8x4_sse2 - -unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x8 vp9_variance4x8_sse2 - -unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance4x4 vp9_variance4x4_sse2 - -unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse); - -unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int 
vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int 
vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int 
vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); -RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); - -unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x64 vp9_sad64x64_sse2 - -unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x64 vp9_sad32x64_sse2 - -unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad64x32 vp9_sad64x32_sse2 - -unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x16 vp9_sad32x16_sse2 - -unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x32 vp9_sad16x32_sse2 - -unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad32x32 vp9_sad32x32_sse2 - -unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x16 vp9_sad16x16_sse2 - -unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad16x8 vp9_sad16x8_sse2 - -unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x16 vp9_sad8x16_sse2 - -unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x8_mmx(const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x8 vp9_sad8x8_sse2 - -unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad8x4 vp9_sad8x4_sse2 - -unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x8 vp9_sad4x8_sse - -unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad); -#define vp9_sad4x4 vp9_sad4x4_sse - -unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2 - -unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2 - -unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2 - -unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2 - -unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2 - -unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const 
uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2 - -unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2 - -unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2 - -unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2 - -unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2 - -unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2 - -unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x8_avg vp9_sad4x8_avg_sse - -unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad); -#define vp9_sad4x4_avg vp9_sad4x4_avg_sse - -unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance_halfpixvar16x16_h_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_sse2 - -unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance_halfpixvar16x16_v_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_sse2 - -unsigned int 
vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_sse2 - -unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c - -unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c - -unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c - -unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c - -unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c - -unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c - -void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x3 vp9_sad64x64x3_c - -void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x3 vp9_sad32x32x3_c - -void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sad_array); - -void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); - -void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad64x64x8 vp9_sad64x64x8_c - -void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad32x32x8 vp9_sad32x32x8_c - -void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x16x8 vp9_sad16x16x8_c - -void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad16x8x8 vp9_sad16x8x8_c - -void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x16x8 vp9_sad8x16x8_c - -void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x8x8 vp9_sad8x8x8_c - -void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad8x4x8 vp9_sad8x4x8_c - -void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x8x8 vp9_sad4x8x8_c - -void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vp9_sad4x4x8 vp9_sad4x4x8_c - -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad64x64x4d vp9_sad64x64x4d_sse2 - -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 - -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2 - -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const 
ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2 - -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2 - -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad32x32x4d vp9_sad32x32x4d_sse2 - -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2 - -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2 - -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2 - -void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2 - -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2 - -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad4x8x4d vp9_sad4x8x4d_sse - -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); -#define vp9_sad4x4x4d vp9_sad4x4x4d_sse +void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +void vp9_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); + 
+void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_mmx
+
+void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_sse2
+
+void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_sse2
+
+void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_sse2
+
+void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_sse2
+
+void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_sse2
+
+void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_mmx
+
+void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_sse2
+
+void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_sse2
+
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2

 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_sse2
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);

 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x8 vp9_mse16x8_sse2
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_sse2

 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_sse2
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_sse2
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_sse2
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2

 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -822,60 +444,460 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int ski
 void vp9_quantize_b_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_b_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht4x4 vp9_short_fht4x4_sse2
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_fp)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht8x8 vp9_short_fht8x8_sse2
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

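The new quantize_fp ("fast path") entry points added above skip vp9_quantize_b's zero-bin test and apply only a round-and-scale step per coefficient. A hedged sketch of that per-coefficient arithmetic, under the assumption that round_ptr/quant_ptr hold the usual Q16 fixed-point tables indexed 0 for DC and 1 for AC (skip_block handling and saturation omitted; my_quantize_fp is a hypothetical stand-in, not the library function):

#include <stdint.h>

static void my_quantize_fp(const int16_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
                           const int16_t *scan) {
  intptr_t i;
  int eob = -1;
  for (i = 0; i < n_coeffs; i++) {
    const int rc = scan[i];             /* visit coefficients in scan order */
    const int coeff = coeff_ptr[rc];
    const int sign = coeff >> 31;       /* 0 for positive, -1 for negative */
    const int abs_coeff = (coeff ^ sign) - sign;
    /* Round, then Q16 multiply: the quantizer step is quant/2^16. */
    const int tmp = ((abs_coeff + round_ptr[rc != 0]) * quant_ptr[rc != 0]) >> 16;
    qcoeff_ptr[rc] = (int16_t)((tmp ^ sign) - sign);   /* restore the sign */
    dqcoeff_ptr[rc] = (int16_t)(qcoeff_ptr[rc] * dequant_ptr[rc != 0]);
    if (tmp) eob = (int)i;              /* track the last nonzero position */
  }
  *eob_ptr = (uint16_t)(eob + 1);
}

Dropping the zbin comparison is what makes this variant cheap enough for the encoder's real-time speed levels.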
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht16x16 vp9_short_fht16x16_sse2
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_refining_search_sad vp9_refining_search_sad_c

-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x16 vp9_sad16x16_sse2

-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_sse2
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2

-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_sse2
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);

-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_sse2
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2

-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_sse2
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c

-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_sse2
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x32 vp9_sad16x32_sse2

-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2

-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2

-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x8 vp9_sad16x8_sse2

-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2
+
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2
+
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x8x8 vp9_sad16x8x8_c
+
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x16 vp9_sad32x16_sse2
+
+unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x32 vp9_sad32x32_sse2
+
+unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2
+
+void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x32x3 vp9_sad32x32x3_c
+
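The x4d entry points that dominate this block amortize motion search: one call scores a single source block against four candidate reference blocks, which lets the SSE2/AVX2 versions load the source data once and reuse it. A hedged reference in plain C, with my_ names standing in for the generated fixed-size functions:

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the x4d contract: sad_array[k] receives the sum of absolute
 * differences between the width x height source block and the k-th of
 * four candidate reference blocks. */
static void my_sad_x4d(const uint8_t *src_ptr, int src_stride,
                       const uint8_t *const ref_ptr[4], int ref_stride,
                       int width, int height, unsigned int *sad_array) {
  int k, r, c;
  for (k = 0; k < 4; ++k) {
    unsigned int sad = 0;
    for (r = 0; r < height; ++r)
      for (c = 0; c < width; ++c)
        sad += (unsigned int)abs(src_ptr[r * src_stride + c] -
                                 ref_ptr[k][r * ref_stride + c]);
    sad_array[k] = sad;
  }
}

The x3 and x8 flavors declared nearby follow the same idea but slide one reference pointer across 3 or 8 adjacent positions instead of taking independent candidates.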
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad32x32x8 vp9_sad32x32x8_c
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x64 vp9_sad32x64_sse2
+
+unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
+
+unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x4 vp9_sad4x4_sse
+
+unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x4_avg vp9_sad4x4_avg_sse
+
+void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x4x4d vp9_sad4x4x4d_sse
+
+void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x4x8 vp9_sad4x4x8_c
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x8 vp9_sad4x8_sse
+
+unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x8_avg vp9_sad4x8_avg_sse
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x8x4d vp9_sad4x8x4d_sse
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x8x8 vp9_sad4x8x8_c
+
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x32 vp9_sad64x32_sse2
+
+unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2
+
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x64 vp9_sad64x64_sse2
+
+unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2
+
+void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x64x3 vp9_sad64x64x3_c
+
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad64x64x8 vp9_sad64x64x8_c
+
+unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x16 vp9_sad8x16_sse2
+
+unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2
+
+void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2
+
+void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x16x8 vp9_sad8x16x8_c
+
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x4 vp9_sad8x4_sse2
+
+unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x4x8 vp9_sad8x4x8_c
+
+unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x8 vp9_sad8x8_sse2
+
+unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2
+
+void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2
+
+void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x8x8 vp9_sad8x8x8_c
+
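All of the variance-family prototypes that follow share one scalar core: accumulate the sum and sum of squares of pixel differences, write the raw SSE through *sse, and return var = sse - sum^2/(w*h). The _avg_ variants first blend the reference prediction with second_pred, ((r + p + 1) >> 1), which is how compound (two-reference) prediction is scored; the sub_pixel variants first bilinearly filter the source by xoffset/yoffset eighth-pel phases. A hedged sketch of the shared core only, with a hypothetical my_ name:

#include <stdint.h>

static unsigned int my_variance(const uint8_t *src_ptr, int source_stride,
                                const uint8_t *ref_ptr, int ref_stride,
                                int w, int h, unsigned int *sse) {
  int r, c;
  int sum = 0;       /* signed accumulation of differences */
  uint32_t ss = 0;   /* accumulation of squared differences */
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      const int d = src_ptr[r * source_stride + c] -
                    ref_ptr[r * ref_stride + c];
      sum += d;
      ss += (uint32_t)(d * d);
    }
  *sse = ss;
  /* Variance = SSE minus the squared-mean contribution of the block. */
  return ss - (unsigned int)(((int64_t)sum * sum) / (w * h));
}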
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vp9_subtract_block vp9_subtract_block_sse2
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2

-void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction);
-#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
+
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2
+
+void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
+
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
+
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
+
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
+
+void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
+
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
+
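The vp9_tm_predictor_* rows above are the TrueMotion intra predictor: each output pixel is left + above - top_left, clamped to 8 bits, with above[-1] assumed to hold the top-left corner sample (the v_ rows simply copy the above row downward). A hedged, block-size-generic sketch of the TM rule; the generated functions are fixed-size specializations of this:

#include <stdint.h>
#include <stddef.h>

static uint8_t my_clip255(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* bs is the square block size (4, 8, 16, or 32 for the entries above). */
static void my_tm_predictor(uint8_t *dst, ptrdiff_t y_stride, int bs,
                            const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < bs; ++r, dst += y_stride)
    for (c = 0; c < bs; ++c)
      /* Gradient prediction: propagate both edges, remove the corner. */
      dst[c] = my_clip255(left[r] + above[c] - top_left);
}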
+unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x32 vp9_variance16x32_sse2
+
+unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x8 vp9_variance16x8_sse2
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance32x64 vp9_variance32x64_sse2
+
+unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x4 vp9_variance4x4_sse2
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x8 vp9_variance4x8_sse2
+
+unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x16 vp9_variance8x16_sse2
+
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x4 vp9_variance8x4_sse2
+
+unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x8 vp9_variance8x8_sse2

 void vp9_rtcd(void);

@@ -887,342 +909,179 @@ static void setup_rtcd_internal(void)
     (void)flags;

-    vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3;
-
-    vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3;
-
-    vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3;
-
-    vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3;
-
-
-
-    vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3;
-
-    vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3;
-
-    vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3;
-
-    vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3;
-
-
-
-    vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3;
-
-    vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3;
-
-    vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3;
-
-    vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
-
-
-
-    vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3;
-
-    vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3;
-
-    vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
-
-    vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    vp9_block_error = vp9_block_error_sse2;
+    if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
     vp9_convolve8 = vp9_convolve8_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3;
-
-    vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
-    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
-
-    vp9_convolve8_vert = vp9_convolve8_vert_sse2;
-    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
-
+    if (flags & HAS_AVX2) vp9_convolve8 = vp9_convolve8_avx2;
     vp9_convolve8_avg = vp9_convolve8_avg_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3;
-
     vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3;
-
     vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2;
     if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
-
-    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3;
-
-    vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3;
-
-    vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3;
-
-    vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3;
-
-    vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3;
-
-    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
-
-    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
-
-    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
-
-    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
-
-    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
-
-    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
-
-    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
-
-    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
-
-    vp9_sub_pixel_variance8x16 = 
vp9_sub_pixel_variance8x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3; - - vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3; - - vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3; - - vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3; - - vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; - - vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3; - - vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; - - vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3; - - vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3; - - vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3; - - vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3; - - vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_sad16x16x3 = vp9_sad16x16x3_c; - if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; - - vp9_sad16x8x3 = vp9_sad16x8x3_c; - if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; - - vp9_sad8x16x3 = vp9_sad8x16x3_c; - if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; - - vp9_sad8x8x3 = vp9_sad8x8x3_c; - if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; - - vp9_sad4x4x3 = vp9_sad4x4x3_c; - if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_quantize_b = vp9_quantize_b_c; - if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; - - vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; - if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3; - - - - - - - - - - + vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; + if (flags & HAS_AVX2) vp9_convolve8_horiz = vp9_convolve8_horiz_avx2; + vp9_convolve8_vert = vp9_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; + if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2; + vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; + vp9_d153_predictor_8x8 = 
vp9_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; + vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; + vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; + vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3; + vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3; + vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; + vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; + vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3; + vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3; + vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; + vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; + vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3; + vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3; + vp9_fdct32x32 = vp9_fdct32x32_sse2; + if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2; + vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2; + if (flags & HAS_AVX2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_avx2; + vp9_fdct8x8 = vp9_fdct8x8_sse2; + if (flags & HAS_SSSE3) vp9_fdct8x8 = vp9_fdct8x8_ssse3; vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - - vp9_refining_search_sad = vp9_refining_search_sad_c; - if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4; - - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4; + vp9_get16x16var = vp9_get16x16var_sse2; + if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2; + vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; + vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; + vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3; + vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; + vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3; + vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3; + vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3; + vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2; + if (flags & HAS_SSSE3) vp9_idct8x8_64_add = 
vp9_idct8x8_64_add_ssse3; + vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2; + if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2; + vp9_mse16x16 = vp9_mse16x16_sse2; + if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2; + vp9_quantize_b = vp9_quantize_b_c; + if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; + vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + vp9_sad16x16x3 = vp9_sad16x16x3_c; + if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; + vp9_sad16x8x3 = vp9_sad16x8x3_c; + if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; + vp9_sad32x32x4d = vp9_sad32x32x4d_sse2; + if (flags & HAS_AVX2) vp9_sad32x32x4d = vp9_sad32x32x4d_avx2; + vp9_sad4x4x3 = vp9_sad4x4x3_c; + if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; + vp9_sad64x64x4d = vp9_sad64x64x4d_sse2; + if (flags & HAS_AVX2) vp9_sad64x64x4d = vp9_sad64x64x4d_avx2; + vp9_sad8x16x3 = vp9_sad8x16x3_c; + if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; + vp9_sad8x8x3 = vp9_sad8x8x3_c; + if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; + vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3; + vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3; + vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3; + vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3; + vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_avx2; + vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3; + vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3; + vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3; + vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3; + vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_avx2; + vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = 
vp9_sub_pixel_avg_variance8x16_ssse3; + vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3; + vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3; + vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3; + vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3; + vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3; + vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3; + vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_avx2; + vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3; + vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3; + vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3; + vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3; + vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3; + if (flags & HAS_AVX2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_avx2; + vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3; + vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; + vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; + vp9_variance16x16 = vp9_variance16x16_sse2; + if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2; + vp9_variance32x16 = vp9_variance32x16_sse2; + if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2; + vp9_variance32x32 = vp9_variance32x32_sse2; + if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2; + vp9_variance64x32 = vp9_variance64x32_sse2; + if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2; + vp9_variance64x64 = vp9_variance64x64_sse2; + if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vp9_rtcd_x86_64-win64-gcc.h b/media/libvpx/vp9_rtcd_x86_64-win64-gcc.h index db6bbc2485ec..dcc090dfe196 100644 --- a/media/libvpx/vp9_rtcd_x86_64-win64-gcc.h +++ b/media/libvpx/vp9_rtcd_x86_64-win64-gcc.h @@ -7,6 +7,10 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + /* * VP9 */ @@ -19,254 +23,22 @@ struct macroblockd; /* Encoder 
forward decls */ struct macroblock; struct vp9_variance_vtable; - -#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +struct search_site_config; +struct mv; union int_mv; struct yv12_buffer_config; -void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c - -void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c - -void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse - -void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse - -void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse - -void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c - -void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c - -void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define 
vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c - -void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c - -void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c - -void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse - -void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2 - -void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse - -void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c - -void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c - -void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c - -void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t 
*left); -void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c - -void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c - -void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2 - -void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2 - -void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2 - -void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c - -void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c - -void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c - -void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c - -void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c - -void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c - -void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2 - -void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2 - -void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2 - -void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c - -void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c - -void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c - -void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2 - -void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const 
uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2 - -void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx - -void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_sse2 - -void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2 - -void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx - -void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_inner vp9_blend_mb_inner_c - -void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_mb_outer vp9_blend_mb_outer_c - -void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride); -#define vp9_blend_b vp9_blend_b_c - -void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve_copy vp9_convolve_copy_sse2 - -void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vp9_convolve_avg vp9_convolve_avg_sse2 +int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz); +RTCD_EXTERN int64_t (*vp9_block_error)(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t 
*ssz); void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - -void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); - void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); @@ -282,25 +54,264 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, 
int h); RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2 +void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2 +void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2 +void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_avg vp9_convolve_avg_sse2 -void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define 
vp9_idct8x8_64_add vp9_idct8x8_64_add_sse2 +void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vp9_convolve_copy vp9_convolve_copy_sse2 -void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride); -void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_idct8x8_10_add vp9_idct8x8_10_add_sse2 +void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c + +void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c + +void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c + +void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c + +void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c + +void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c + +void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c + +void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c + +void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c + +void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void 
(*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, 
const uint8_t *left); + +void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); + +void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c + +void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c + +void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c + +void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c + +void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c + +void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c + +void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c + +void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c + +void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2 + +void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2 + +void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse + +void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse + +void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c + +void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c + +void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c + +void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
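
[Note: the generated headers above follow a single dispatch convention. A function whose fastest implementation is already covered by the build's baseline (x86_64 implies SSE2) is bound at compile time with a plain #define alias, while a function with variants above the baseline (SSSE3, AVX2) is declared as an RTCD_EXTERN function pointer that setup_rtcd_internal() retargets once the CPU capability flags are known. A minimal compilable sketch of that convention follows; my_func, cpu_flags and the flag value are illustrative stand-ins, not real libvpx identifiers:

    #include <stdint.h>

    #define HAS_SSSE3 0x01  /* illustrative flag bit; real values live elsewhere in libvpx */

    /* Baseline variant: x86_64 builds may assume SSE2, so it is the default. */
    static void my_func_sse2(const uint8_t *src, int stride) { (void)src; (void)stride; }
    /* Optional variant: only legal to call when the CPU reports SSSE3. */
    static void my_func_ssse3(const uint8_t *src, int stride) { (void)src; (void)stride; }

    /* In the generated header this is `RTCD_EXTERN void (*my_func)(...)`,
     * i.e. one pointer declared extern in the header and defined once in
     * the library; it is made static here only so the sketch is self-contained. */
    static void (*my_func)(const uint8_t *src, int stride);

    static int cpu_flags(void) { return 0; /* hypothetical stand-in for the SIMD probe */ }

    static void setup_rtcd_internal_sketch(void)
    {
        int flags = cpu_flags();
        my_func = my_func_sse2;                         /* best compile-time guarantee */
        if (flags & HAS_SSSE3) my_func = my_func_ssse3; /* upgrade when available */
    }

Starting from the baseline variant and conditionally upgrading, as setup_rtcd_internal() does above, means a missing flag can never leave the pointer unset; the #define form used for baseline-only functions also avoids the indirect call entirely.]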
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16 vp9_fdct16x16_sse2
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16_1 vp9_fdct16x16_1_sse2
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32_1 vp9_fdct32x32_1_sse2
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_avx2(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct32x32_rd)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct4x4 vp9_fdct4x4_sse2
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct4x4_1 vp9_fdct4x4_1_sse2
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_ssse3(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct8x8_1 vp9_fdct8x8_1_sse2
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_sse2
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_sse2
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_sse2
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_mmx
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vp9_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get8x8var vp9_get8x8var_sse2
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *);
+#define vp9_get_mb_ss vp9_get_mb_ss_sse2
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
@@ -308,23 +319,46 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stri
 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_sse2
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_sse2
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_sse2
 
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
+
 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_idct32x32_34_add vp9_idct32x32_34_add_sse2
 
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
+
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
+
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_64_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
 
 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
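Throughout this generated header, each kernel is exposed in one of two ways: a plain #define binds the generic name to the best variant the build's baseline instruction set (SSE2 in this configuration) already guarantees, while RTCD_EXTERN declares a function pointer for kernels whose SSSE3/AVX2 variants can only be chosen after probing the CPU at runtime. The following is a minimal sketch of how such a pointer gets resolved, assuming the x86_simd_caps()/HAS_* helpers from vpx_ports/x86.h; the helper name here is hypothetical, standing in for the setup_rtcd_internal() routine the real header emits:

/* Sketch only, not part of the patch: resolving RTCD_EXTERN pointers. */
#include <stdint.h>
#include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSSE3, HAS_AVX2 */

extern void (*vp9_fdct32x32)(const int16_t *input, int16_t *output, int stride);
void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
void vp9_fdct32x32_avx2(const int16_t *input, int16_t *output, int stride);

static void setup_fdct_pointers(void) {   /* hypothetical helper name */
  int flags = x86_simd_caps();             /* CPUID feature bits, read once */
  vp9_fdct32x32 = vp9_fdct32x32_sse2;      /* SSE2 is this build's baseline */
  if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2;
}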
@@ -334,485 +368,73 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
 
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
 
 void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c
 
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
-
-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_sse2
-
-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_sse2
-
-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_sse2
-
-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_sse2
-
-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_sse2
-
-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_sse2
-
-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_sse2
-
-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_sse2
-
-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_sse2
-
-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_sse2
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get_sse_sum_8x8 vp9_get8x8var_sse2
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_sse2
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_sse2
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_sse2
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x64 vp9_sad64x64_sse2
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x64 vp9_sad32x64_sse2
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x32 vp9_sad64x32_sse2
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x16 vp9_sad32x16_sse2
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x32 vp9_sad16x32_sse2
-
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x32 vp9_sad32x32_sse2
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x16 vp9_sad16x16_sse2
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x8 vp9_sad16x8_sse2
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x16 vp9_sad8x16_sse2
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x8 vp9_sad8x8_sse2
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x4 vp9_sad8x4_sse2
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x8 vp9_sad4x8_sse
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x4 vp9_sad4x4_sse
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_sse
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_sse
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_sse2
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_sse2
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_sse2
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_sse2
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_sse2
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_sse
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_sse
+void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_lpf_horizontal_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+
+void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_mmx
+
+void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_sse2
+
+void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_sse2
+
+void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_sse2
+
+void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_sse2
+
+void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_sse2
+
+void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_mmx
+
+void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_sse2
+
+void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_sse2
+
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2
 
 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x16 vp9_mse16x16_sse2
-
-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
+unsigned int vp9_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 
 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x8 vp9_mse16x8_sse2
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_sse2
 
 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_sse2
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_sse2
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_sse2
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2
 
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
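From the caller's side none of this churn is visible: whether a kernel name is a macro aliasing one concrete variant or an RTCD_EXTERN pointer (as vp9_mse16x16 becomes in the hunk above), call sites always go through the generic name. A sketch of the caller's view, assuming the dispatch table has already been initialized once at codec startup via the header's vp9_rtcd() entry point, and with illustrative argument names:

/* Sketch only, not part of the patch: calling a dispatched kernel. */
#include <stdint.h>

/* Matches the RTCD_EXTERN pointer declared by this header. */
extern unsigned int (*vp9_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);

static unsigned int block_mse_16x16(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  /* Identical call whether vp9_mse16x16 is a #define or a runtime
     pointer; vp9_rtcd() must have populated the pointers beforehand. */
  return vp9_mse16x16(src, src_stride, ref, ref_stride, &sse);
}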
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_sse2
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_sse2
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_sse2
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2
 
 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -822,60 +444,460 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int ski
 void vp9_quantize_b_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_b_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht4x4 vp9_short_fht4x4_sse2
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_fp)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht8x8 vp9_short_fht8x8_sse2
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht16x16 vp9_short_fht16x16_sse2
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_refining_search_sad vp9_refining_search_sad_c
 
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x16 vp9_sad16x16_sse2
 
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_sse2
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2
 
-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_sse2
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
 
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_sse2
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2
 
-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_sse2
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c
 
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_sse2
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x32 vp9_sad16x32_sse2
 
-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2
 
-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
 
-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x8 vp9_sad16x8_sse2
 
-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2
+
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2
+
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x8x8 vp9_sad16x8x8_c
+
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x16 vp9_sad32x16_sse2
+
+unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x32 vp9_sad32x32_sse2
+
+unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2
+
+void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x32x3 vp9_sad32x32x3_c
+
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad32x32x8 vp9_sad32x32x8_c
+
+unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x64 vp9_sad32x64_sse2
+
+unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2
+
+void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
+
+unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x4 vp9_sad4x4_sse
+
+unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x4_avg vp9_sad4x4_avg_sse
+
+void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x4x4d vp9_sad4x4x4d_sse
+
+void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x4x8 vp9_sad4x4x8_c
+
+unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad4x8 vp9_sad4x8_sse
+
+unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad4x8_avg vp9_sad4x8_avg_sse
+
+void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad4x8x4d vp9_sad4x8x4d_sse
+
+void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad4x8x8 vp9_sad4x8x8_c
+
+unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x32 vp9_sad64x32_sse2
+
+unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2
+
+void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2
+
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad64x64 vp9_sad64x64_sse2
+
+unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2
+
+void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+#define vp9_sad64x64x3 vp9_sad64x64x3_c
+
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+
+void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad64x64x8 vp9_sad64x64x8_c
+
+unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x16 vp9_sad8x16_sse2
+
+unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2
+
+void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2
+
+void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x16x8 vp9_sad8x16x8_c
+
+unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x4 vp9_sad8x4_sse2
+
+unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2
+
+void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2
+
+void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x4x8 vp9_sad8x4x8_c
+
+unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad8x8 vp9_sad8x8_sse2
+
+unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2
+
+void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2
+
+void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad8x8x8 vp9_sad8x8x8_c
+
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
+
+unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vp9_subtract_block vp9_subtract_block_sse2
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
 #define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2
 
-void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction);
-#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c
+void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
+
+void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2
+
+void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
+
+void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
+
+void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
+
+void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
+
+void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
+
+void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
+
+unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x32 vp9_variance16x32_sse2
+
+unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance16x8 vp9_variance16x8_sse2
+
+unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance32x64 vp9_variance32x64_sse2
+
+unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x4 vp9_variance4x4_sse2
+
+unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance4x8 vp9_variance4x8_sse2
+
+unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vp9_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+
+unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x16 vp9_variance8x16_sse2
+
+unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x4 vp9_variance8x4_sse2
+
+unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vp9_variance8x8 vp9_variance8x8_sse2
 
 void vp9_rtcd(void);
@@ -887,342 +909,179 @@ static void setup_rtcd_internal(void)
     (void)flags;
 
-    vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3;
-
-    vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3;
-
-    vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3;
-
-    vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3;
-
-
-
-    vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c;
-    if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3;
-
-
-
-
-
-
-
-    vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3;
-
-    vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c;
-    if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3;
-
-    vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c;
vp9_d63_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3; - - vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; - - - - vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; - - - - - - - - vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; - - vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; - - vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; - - vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; - - - - vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; - - - - - - - - vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; - - vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; - - vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; - - vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; - - - - - - - - - - - - - - - - - - - - - + vp9_block_error = vp9_block_error_sse2; + if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_convolve8 = vp9_convolve8_sse2; if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3; - - vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; - - vp9_convolve8_vert = vp9_convolve8_vert_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; - + if (flags & HAS_AVX2) vp9_convolve8 = vp9_convolve8_avx2; vp9_convolve8_avg = vp9_convolve8_avg_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3; - vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3; - vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3; - - vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3; - - vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3; - - vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3; - - vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3; - - vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2; - if (flags & 
-
-    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
-
-    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
-
-    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
-
-    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
-
-    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
-
-    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
-
-    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
-
-    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
-
-    vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3;
-
-    vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3;
-
-    vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3;
-
-    vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3;
-
-    vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3;
-
-    vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3;
-
-    vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3;
-
-    vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3;
-
-    vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3;
-
-    vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3;
-
-    vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3;
-
-    vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse;
-    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_sad16x16x3 = vp9_sad16x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
-
-    vp9_sad16x8x3 = vp9_sad16x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
-    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
-
-    vp9_sad8x16x3 = vp9_sad8x16x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
-
-    vp9_sad8x8x3 = vp9_sad8x8x3_c;
-    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
-
-    vp9_sad4x4x3 = vp9_sad4x4x3_c;
-    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    vp9_quantize_b = vp9_quantize_b_c;
-    if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
-
-    vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
-    if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
-
-
-
-
-
-
-
-
-
-
+    vp9_convolve8_horiz = vp9_convolve8_horiz_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3;
+    if (flags & HAS_AVX2) vp9_convolve8_horiz = vp9_convolve8_horiz_avx2;
+    vp9_convolve8_vert = vp9_convolve8_vert_sse2;
+    if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3;
+    if (flags & HAS_AVX2) vp9_convolve8_vert = vp9_convolve8_vert_avx2;
+    vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3;
+    vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3;
+    vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3;
+    vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3;
+    vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3;
+    vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3;
+    vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3;
+    vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3;
+    vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3;
+    vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3;
+    vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3;
+    vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3;
+    vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+    vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3;
+    vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3;
+    vp9_fdct32x32 = vp9_fdct32x32_sse2;
+    if (flags & HAS_AVX2) vp9_fdct32x32 = vp9_fdct32x32_avx2;
+    vp9_fdct32x32_rd = vp9_fdct32x32_rd_sse2;
+    if (flags & HAS_AVX2) vp9_fdct32x32_rd = vp9_fdct32x32_rd_avx2;
+    vp9_fdct8x8 = vp9_fdct8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_fdct8x8 = vp9_fdct8x8_ssse3;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
-
-    vp9_refining_search_sad = vp9_refining_search_sad_c;
-    if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4;
-
-    vp9_diamond_search_sad = vp9_diamond_search_sad_c;
-    if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4;
+    vp9_get16x16var = vp9_get16x16var_sse2;
+    if (flags & HAS_AVX2) vp9_get16x16var = vp9_get16x16var_avx2;
+    vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3;
+    vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
+    vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3;
+    vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c;
+    if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3;
+    vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3;
+    vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3;
+    vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3;
+    vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2;
+    if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3;
+    vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_sse2;
+    if (flags & HAS_AVX2) vp9_lpf_horizontal_16 = vp9_lpf_horizontal_16_avx2;
+    vp9_mse16x16 = vp9_mse16x16_sse2;
+    if (flags & HAS_AVX2) vp9_mse16x16 = vp9_mse16x16_avx2;
+    vp9_quantize_b = vp9_quantize_b_c;
+    if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3;
+    vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3;
+    vp9_quantize_fp = vp9_quantize_fp_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3;
+    vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c;
+    if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3;
+    vp9_sad16x16x3 = vp9_sad16x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3;
+    vp9_sad16x8x3 = vp9_sad16x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3;
+    if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3;
+    vp9_sad32x32x4d = vp9_sad32x32x4d_sse2;
+    if (flags & HAS_AVX2) vp9_sad32x32x4d = vp9_sad32x32x4d_avx2;
+    vp9_sad4x4x3 = vp9_sad4x4x3_c;
+    if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3;
+    vp9_sad64x64x4d = vp9_sad64x64x4d_sse2;
+    if (flags & HAS_AVX2) vp9_sad64x64x4d = vp9_sad64x64x4d_avx2;
+    vp9_sad8x16x3 = vp9_sad8x16x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3;
+    vp9_sad8x8x3 = vp9_sad8x8x3_c;
+    if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3;
+    vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3;
+    vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3;
+    vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3;
+    vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3;
+    vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_avx2;
+    vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3;
+    vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3;
+    vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3;
+    vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3;
+    vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_avx2;
+    vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3;
+    vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3;
+    vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3;
+    vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3;
+    vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3;
+    vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3;
+    vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3;
+    vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_avx2;
+    vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3;
+    vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3;
+    vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3;
+    vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3;
+    vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3;
+    if (flags & HAS_AVX2) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_avx2;
+    vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3;
+    vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3;
+    vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2;
+    if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3;
+    vp9_variance16x16 = vp9_variance16x16_sse2;
+    if (flags & HAS_AVX2) vp9_variance16x16 = vp9_variance16x16_avx2;
+    vp9_variance32x16 = vp9_variance32x16_sse2;
+    if (flags & HAS_AVX2) vp9_variance32x16 = vp9_variance32x16_avx2;
+    vp9_variance32x32 = vp9_variance32x32_sse2;
+    if (flags & HAS_AVX2) vp9_variance32x32 = vp9_variance32x32_avx2;
+    vp9_variance64x32 = vp9_variance64x32_sse2;
+    if (flags & HAS_AVX2) vp9_variance64x32 = vp9_variance64x32_avx2;
+    vp9_variance64x64 = vp9_variance64x64_sse2;
+    if (flags & HAS_AVX2) vp9_variance64x64 = vp9_variance64x64_avx2;
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
diff --git a/media/libvpx/vp9_rtcd_x86_64-win64-vs8.h b/media/libvpx/vp9_rtcd_x86_64-win64-vs8.h
index db6bbc2485ec..4e8678a1371f 100644
--- a/media/libvpx/vp9_rtcd_x86_64-win64-vs8.h
+++ b/media/libvpx/vp9_rtcd_x86_64-win64-vs8.h
@@ -7,6 +7,10 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * VP9
  */
@@ -19,254 +23,20 @@ struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
 struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct search_site_config;
+struct mv;
 union int_mv;
 struct yv12_buffer_config;
 
-void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
-
-void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
-
-void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse
-
-void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse
-
-void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
-
-void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
-
-void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
-
-void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
-
-void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
-
-void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
-
-void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse
-
-void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2
-
-void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
-
-void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
-
-void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
-
-void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
-
-void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
-
-void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
-
-void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2
-
-void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2
-
-void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
-
-void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
-
-void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
-
-void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
-
-void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-
-void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
-
-void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
-
-void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
-
-void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2
-
-void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2
-
-void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
-
-void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
-
-void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
-
-void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
-
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
-#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2
-
-void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2
-
-void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx
-
-void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_sse2
-
-void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2
-
-void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx
-
-void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_inner vp9_blend_mb_inner_c
-
-void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_mb_outer vp9_blend_mb_outer_c
-
-void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
-#define vp9_blend_b vp9_blend_b_c
-
-void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_copy vp9_convolve_copy_sse2
-
-void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vp9_convolve_avg vp9_convolve_avg_sse2
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+int64_t vp9_block_error_sse2(const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_sse2
 
 void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
-void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-
 void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
@@ -282,25 +52,259 @@ void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8
 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vp9_convolve8_avg_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*vp9_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
 
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_avg vp9_convolve_avg_sse2
 
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_64_add vp9_idct8x8_64_add_sse2
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vp9_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vp9_convolve_copy vp9_convolve_copy_sse2
 
-void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct8x8_10_add vp9_idct8x8_10_add_sse2
+void vp9_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_16x16 vp9_d117_predictor_16x16_c
+
+void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
+
+void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_4x4 vp9_d117_predictor_4x4_c
+
+void vp9_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d117_predictor_8x8 vp9_d117_predictor_8x8_c
+
+void vp9_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_16x16 vp9_d135_predictor_16x16_c
+
+void vp9_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_32x32 vp9_d135_predictor_32x32_c
+
+void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_4x4 vp9_d135_predictor_4x4_c
+
+void vp9_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d135_predictor_8x8 vp9_d135_predictor_8x8_c
+
+void vp9_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_d153_predictor_32x32 vp9_d153_predictor_32x32_c
+
+void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d45_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_16x16 vp9_dc_128_predictor_16x16_c
+
+void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_32x32 vp9_dc_128_predictor_32x32_c
+
+void vp9_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_4x4 vp9_dc_128_predictor_4x4_c
+
+void vp9_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_128_predictor_8x8 vp9_dc_128_predictor_8x8_c
+
+void vp9_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_16x16 vp9_dc_left_predictor_16x16_c
+
+void vp9_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_32x32 vp9_dc_left_predictor_32x32_c
+
+void vp9_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_4x4 vp9_dc_left_predictor_4x4_c
+
+void vp9_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_left_predictor_8x8 vp9_dc_left_predictor_8x8_c
+
+void vp9_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_16x16 vp9_dc_predictor_16x16_sse2
+
+void vp9_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_32x32 vp9_dc_predictor_32x32_sse2
+
+void vp9_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_4x4 vp9_dc_predictor_4x4_sse
+
+void vp9_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_dc_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_predictor_8x8 vp9_dc_predictor_8x8_sse
+
+void vp9_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_16x16 vp9_dc_top_predictor_16x16_c
+
+void vp9_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_32x32 vp9_dc_top_predictor_32x32_c
+
+void vp9_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_4x4 vp9_dc_top_predictor_4x4_c
+
+void vp9_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vp9_dc_top_predictor_8x8 vp9_dc_top_predictor_8x8_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16 vp9_fdct16x16_sse2
+
+void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct16x16_1 vp9_fdct16x16_1_sse2
+
+void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32 vp9_fdct32x32_sse2
+
+void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32_1 vp9_fdct32x32_1_sse2
+
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct32x32_rd vp9_fdct32x32_rd_sse2
+
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct4x4 vp9_fdct4x4_sse2
+
+void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct4x4_1 vp9_fdct4x4_1_sse2
+
+void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_ssse3(const int16_t *input, int16_t *output, int stride);
+RTCD_EXTERN void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);
+
+void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride);
+#define vp9_fdct8x8_1 vp9_fdct8x8_1_sse2
+
+void vp9_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_sse2
+
+void vp9_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_sse2
+
+void vp9_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
+void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_sse2
+
+int vp9_full_range_search_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_full_range_search vp9_full_range_search_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+int vp9_full_search_sadx8(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+RTCD_EXTERN int (*vp9_full_search_sad)(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
+void vp9_fwht4x4_mmx(const int16_t *input, int16_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_mmx
+
+void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get16x16var vp9_get16x16var_sse2
+
+void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vp9_get8x8var vp9_get8x8var_sse2
+
+unsigned int vp9_get_mb_ss_c(const int16_t *);
+unsigned int vp9_get_mb_ss_sse2(const int16_t *);
+#define vp9_get_mb_ss vp9_get_mb_ss_sse2
+
+void vp9_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vp9_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_10_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
@@ -308,23 +312,46 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stri
 
 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_256_add vp9_idct16x16_256_add_sse2
-
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct16x16_10_add vp9_idct16x16_10_add_sse2
+void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct16x16_256_add)(const int16_t *input, uint8_t *dest, int dest_stride);
 
 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_sse2
 
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
+
 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_idct32x32_34_add vp9_idct32x32_34_add_sse2
 
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_sse2
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_16_add vp9_idct4x4_16_add_sse2
+
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct4x4_1_add vp9_idct4x4_1_add_sse2
+
+void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_12_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_12_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct8x8_1_add vp9_idct8x8_1_add_sse2
+
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride);
+void vp9_idct8x8_64_add_ssse3(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct8x8_64_add)(const int16_t *input, uint8_t *dest, int dest_stride);
+
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2

 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
@@ -334,485 +361,71 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int dest_stride, i
 void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2

-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c

 void vp9_iwht4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
 #define vp9_iwht4x4_1_add vp9_iwht4x4_1_add_c

-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_iwht4x4_16_add vp9_iwht4x4_16_add_c
+void vp9_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_16 vp9_lpf_horizontal_16_sse2

-unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x16 vp9_variance32x16_sse2
+void vp9_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_4 vp9_lpf_horizontal_4_mmx

-unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x32 vp9_variance16x32_sse2
+void vp9_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_4_dual vp9_lpf_horizontal_4_dual_sse2

-unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x32 vp9_variance64x32_sse2
+void vp9_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_horizontal_8 vp9_lpf_horizontal_8_sse2

-unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x64 vp9_variance32x64_sse2
+void vp9_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_horizontal_8_dual vp9_lpf_horizontal_8_dual_sse2

-unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance32x32 vp9_variance32x32_sse2
+void vp9_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16 vp9_lpf_vertical_16_sse2

-unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance64x64 vp9_variance64x64_sse2
+void vp9_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_lpf_vertical_16_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_lpf_vertical_16_dual vp9_lpf_vertical_16_dual_sse2

-unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x16 vp9_variance16x16_sse2
+void vp9_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_4_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_4 vp9_lpf_vertical_4_mmx

-unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance16x8 vp9_variance16x8_sse2
+void vp9_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_4_dual vp9_lpf_vertical_4_dual_sse2

-unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x16 vp9_variance8x16_sse2
+void vp9_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+#define vp9_lpf_vertical_8 vp9_lpf_vertical_8_sse2

-unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x8 vp9_variance8x8_sse2
-
-void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vp9_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vp9_get_sse_sum_8x8 vp9_get8x8var_sse2
-
-unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance8x4 vp9_variance8x4_sse2
-
-unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x8 vp9_variance4x8_sse2
-
-unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance4x4 vp9_variance4x4_sse2
-
-unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred);
-
-unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x64 vp9_sad64x64_sse2
-
-unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x64 vp9_sad32x64_sse2
-
-unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad64x32 vp9_sad64x32_sse2
-
-unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x16 vp9_sad32x16_sse2
-
-unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x32 vp9_sad16x32_sse2
-
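The long run of deletions continuing below strips the old SAD prototypes, whose trailing unsigned int max_sad argument allowed an early exit once a candidate was already worse than the best so far; the replacement prototypes added further down (vp9_sad16x16 and friends) drop that parameter and always compute the full sum. For orientation, every vp9_sadWxH kernel computes the same quantity, a sum of absolute differences over a WxH block; a minimal C sketch of those semantics (illustrative, not the libvpx source):

#include <stdint.h>
#include <stdlib.h>  /* abs() */

/* Sum of absolute differences between a WxH source block and a
 * reference block, each addressed with its own row stride. */
static unsigned int sad_wxh(const uint8_t *src_ptr, int source_stride,
                            const uint8_t *ref_ptr, int ref_stride,
                            int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      sad += abs(src_ptr[c] - ref_ptr[c]);
    src_ptr += source_stride;
    ref_ptr += ref_stride;
  }
  return sad;
}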
-unsigned int vp9_sad32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad32x32 vp9_sad32x32_sse2
-
-unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x16 vp9_sad16x16_sse2
-
-unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad16x8 vp9_sad16x8_sse2
-
-unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x16 vp9_sad8x16_sse2
-
-unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x8 vp9_sad8x8_sse2
-
-unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad8x4 vp9_sad8x4_sse2
-
-unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x8 vp9_sad4x8_sse
-
-unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad);
-#define vp9_sad4x4 vp9_sad4x4_sse
-
-unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2
-
-unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2
-
-unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2
-
-unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2
-
-unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2
-
-unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2
-
-unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2
-
-unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2
-
-unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2
-
-unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2
-
-unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2
-
-unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x8_avg vp9_sad4x8_avg_sse
-
-unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad);
-#define vp9_sad4x4_avg vp9_sad4x4_avg_sse
-
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_sse2
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_sse2
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_sse2
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_h vp9_variance_halfpixvar64x64_h_c
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_v vp9_variance_halfpixvar64x64_v_c
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar64x64_hv vp9_variance_halfpixvar64x64_hv_c
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_h vp9_variance_halfpixvar32x32_h_c
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_v vp9_variance_halfpixvar32x32_v_c
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_variance_halfpixvar32x32_hv vp9_variance_halfpixvar32x32_hv_c
-
-void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x3 vp9_sad64x64x3_c
-
-void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x3 vp9_sad32x32x3_c
-
-void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
-
-void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad64x64x8 vp9_sad64x64x8_c
-
-void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad32x32x8 vp9_sad32x32x8_c
-
-void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x16x8 vp9_sad16x16x8_c
-
-void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad16x8x8 vp9_sad16x8x8_c
-
-void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x16x8 vp9_sad8x16x8_c
-
-void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x8x8 vp9_sad8x8x8_c
-
-void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad8x4x8 vp9_sad8x4x8_c
-
-void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x8x8 vp9_sad4x8x8_c
-
-void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vp9_sad4x4x8 vp9_sad4x4x8_c
-
-void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x64x4d vp9_sad64x64x4d_sse2
-
-void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2
-
-void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2
-
-void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2
-
-void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2
-
-void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad32x32x4d vp9_sad32x32x4d_sse2
-
-void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2
-
-void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2
-
-void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2
-
-void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2
-
-void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2
-
-void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x8x4d vp9_sad4x8x4d_sse
-
-void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
-#define vp9_sad4x4x4d vp9_sad4x4x4d_sse
+void vp9_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_lpf_vertical_8_dual vp9_lpf_vertical_8_dual_sse2

 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vp9_mse16x16_mmx(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 unsigned int vp9_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
 #define vp9_mse16x16 vp9_mse16x16_sse2

-unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x16 vp9_mse8x16_c
-
 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse16x8 vp9_mse16x8_c
+unsigned int vp9_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse16x8 vp9_mse16x8_sse2
+
+unsigned int vp9_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vp9_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x16 vp9_mse8x16_sse2

 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-#define vp9_mse8x8 vp9_mse8x8_c
-
-unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse64x64 vp9_sub_pixel_mse64x64_c
-
-unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vp9_sub_pixel_mse32x32 vp9_sub_pixel_mse32x32_c
-
-unsigned int vp9_get_mb_ss_c(const int16_t *);
-unsigned int vp9_get_mb_ss_mmx(const int16_t *);
-unsigned int vp9_get_mb_ss_sse2(const int16_t *);
-#define vp9_get_mb_ss vp9_get_mb_ss_sse2
-
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_sse2
-
-void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-#define vp9_subtract_block vp9_subtract_block_sse2
+unsigned int vp9_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vp9_mse8x8 vp9_mse8x8_sse2

 void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_b_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -822,60 +435,449 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int ski
 void vp9_quantize_b_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_b_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht4x4 vp9_short_fht4x4_sse2
+void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_fp)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht8x8 vp9_short_fht8x8_sse2
+void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_ssse3(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_quantize_fp_32x32)(const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output, int stride, int tx_type);
-void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output, int stride, int tx_type);
-#define vp9_short_fht16x16 vp9_short_fht16x16_sse2
+int vp9_refining_search_sad_c(const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_refining_search_sad vp9_refining_search_sad_c

-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_c
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x16 vp9_sad16x16_sse2

-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct4x4 vp9_fdct4x4_sse2
+unsigned int vp9_sad16x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x16_avg vp9_sad16x16_avg_sse2

-void vp9_fdct8x8_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct8x8 vp9_fdct8x8_sse2
+void vp9_sad16x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);

-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct16x16 vp9_fdct16x16_sse2
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x16x4d vp9_sad16x16x4d_sse2

-void vp9_fdct32x32_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32 vp9_fdct32x32_sse2
+void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x16x8 vp9_sad16x16x8_c

-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *output, int stride);
-void vp9_fdct32x32_rd_sse2(const int16_t *input, int16_t *output, int stride);
-#define vp9_fdct32x32_rd vp9_fdct32x32_rd_sse2
+unsigned int vp9_sad16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x32 vp9_sad16x32_sse2

-int vp9_full_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx3(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-int vp9_full_search_sadx8(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
-RTCD_EXTERN int (*vp9_full_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n);
+unsigned int vp9_sad16x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x32_avg vp9_sad16x32_avg_sse2

-int vp9_refining_search_sad_c(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_refining_search_sadx4(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_refining_search_sad)(struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x32x4d vp9_sad16x32x4d_sse2

-int vp9_diamond_search_sad_c(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-int vp9_diamond_search_sadx4(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
-RTCD_EXTERN int (*vp9_diamond_search_sad)(struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv);
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad16x8 vp9_sad16x8_sse2

-void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+unsigned int vp9_sad16x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad16x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad16x8_avg vp9_sad16x8_avg_sse2
+
+void vp9_sad16x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x3_ssse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+RTCD_EXTERN void (*vp9_sad16x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array);
+
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad16x8x4d vp9_sad16x8x4d_sse2
+
+void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
+#define vp9_sad16x8x8 vp9_sad16x8x8_c
+
+unsigned int vp9_sad32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+unsigned int vp9_sad32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vp9_sad32x16 vp9_sad32x16_sse2
+
+unsigned int vp9_sad32x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+unsigned int vp9_sad32x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vp9_sad32x16_avg vp9_sad32x16_avg_sse2
+
+void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+void vp9_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array);
+#define vp9_sad32x16x4d vp9_sad32x16x4d_sse2
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x32 vp9_sad32x32_sse2 + +unsigned int vp9_sad32x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x32_avg vp9_sad32x32_avg_sse2 + +void vp9_sad32x32x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x3 vp9_sad32x32x3_c + +void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad32x32x4d vp9_sad32x32x4d_sse2 + +void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad32x32x8 vp9_sad32x32x8_c + +unsigned int vp9_sad32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad32x64 vp9_sad32x64_sse2 + +unsigned int vp9_sad32x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad32x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad32x64_avg vp9_sad32x64_avg_sse2 + +void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad32x64x4d vp9_sad32x64x4d_sse2 + +unsigned int vp9_sad4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad4x4_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x4 vp9_sad4x4_sse + +unsigned int vp9_sad4x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad4x4_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x4_avg vp9_sad4x4_avg_sse + +void vp9_sad4x4x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad4x4x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x4x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad4x4x4d vp9_sad4x4x4d_sse + +void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x4x8 vp9_sad4x4x8_c + +unsigned int vp9_sad4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad4x8_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad4x8 vp9_sad4x8_sse + +unsigned int vp9_sad4x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad4x8_avg_sse(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad4x8_avg vp9_sad4x8_avg_sse + +void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad4x8x4d_sse(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad4x8x4d vp9_sad4x8x4d_sse + +void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad4x8x8 vp9_sad4x8x8_c + +unsigned int vp9_sad64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x32 vp9_sad64x32_sse2 + +unsigned int vp9_sad64x32_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad64x32_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x32_avg vp9_sad64x32_avg_sse2 + +void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad64x32x4d vp9_sad64x32x4d_sse2 + +unsigned int vp9_sad64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad64x64 vp9_sad64x64_sse2 + +unsigned int vp9_sad64x64_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad64x64_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad64x64_avg vp9_sad64x64_avg_sse2 + +void vp9_sad64x64x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x3 vp9_sad64x64x3_c + +void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad64x64x4d vp9_sad64x64x4d_sse2 + +void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad64x64x8 vp9_sad64x64x8_c + +unsigned int vp9_sad8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x16_sse2(const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x16 vp9_sad8x16_sse2 + +unsigned int vp9_sad8x16_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x16_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x16_avg vp9_sad8x16_avg_sse2 + +void vp9_sad8x16x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x16x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x16x4d vp9_sad8x16x4d_sse2 + +void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x16x8 vp9_sad8x16x8_c + +unsigned int vp9_sad8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x4 vp9_sad8x4_sse2 + +unsigned int vp9_sad8x4_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x4_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x4_avg vp9_sad8x4_avg_sse2 + +void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +void vp9_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x4x4d vp9_sad8x4x4d_sse2 + +void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x4x8 vp9_sad8x4x8_c + +unsigned int vp9_sad8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vp9_sad8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride); +#define vp9_sad8x8 vp9_sad8x8_sse2 + +unsigned int vp9_sad8x8_avg_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vp9_sad8x8_avg_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vp9_sad8x8_avg vp9_sad8x8_avg_sse2 + +void vp9_sad8x8x3_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +void vp9_sad8x8x3_sse3(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); +RTCD_EXTERN void (*vp9_sad8x8x3)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); + +void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, 
unsigned int *sad_array); +void vp9_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array); +#define vp9_sad8x8x4d vp9_sad8x8x4d_sse2 + +void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vp9_sad8x8x8 vp9_sad8x8x8_c + +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int 
(*vp9_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned 
int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); 
+unsigned int vp9_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +unsigned int vp9_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + +unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x8_sse(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vp9_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vp9_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vp9_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vp9_subtract_block vp9_subtract_block_sse2 + +void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +void vp9_temporal_filter_apply_sse2(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, 
int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); #define vp9_temporal_filter_apply vp9_temporal_filter_apply_sse2 -void vp9_yv12_copy_partial_frame_c(struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction); -#define vp9_yv12_copy_partial_frame vp9_yv12_copy_partial_frame_c +void vp9_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_16x16 vp9_tm_predictor_16x16_sse2 + +void vp9_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_32x32 vp9_tm_predictor_32x32_sse2 + +void vp9_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_4x4 vp9_tm_predictor_4x4_sse + +void vp9_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_tm_predictor_8x8 vp9_tm_predictor_8x8_sse2 + +void vp9_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_16x16 vp9_v_predictor_16x16_sse2 + +void vp9_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_32x32 vp9_v_predictor_32x32_sse2 + +void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_4x4_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_4x4 vp9_v_predictor_4x4_sse + +void vp9_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vp9_v_predictor_8x8_sse(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vp9_v_predictor_8x8 vp9_v_predictor_8x8_sse + +unsigned int vp9_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x16 vp9_variance16x16_sse2 + +unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x32 vp9_variance16x32_sse2 + +unsigned int vp9_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance16x8 vp9_variance16x8_sse2 + +unsigned int 
vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x16 vp9_variance32x16_sse2 + +unsigned int vp9_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x32 vp9_variance32x32_sse2 + +unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance32x64 vp9_variance32x64_sse2 + +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x4 vp9_variance4x4_sse2 + +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance4x8 vp9_variance4x8_sse2 + +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance64x32 vp9_variance64x32_sse2 + +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance64x64 vp9_variance64x64_sse2 + +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x16 vp9_variance8x16_sse2 + +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x4 vp9_variance8x4_sse2 + +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vp9_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vp9_variance8x8 vp9_variance8x8_sse2 void vp9_rtcd(void); @@ -886,343 +888,147 @@ static void setup_rtcd_internal(void) int flags = x86_simd_caps(); (void)flags; - - vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3; - - vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; - 
if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3; - - vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3; - - vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3; - - - - vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; - - - - - - - - vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3; - - vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3; - - vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3; - - vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; - - - - vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; - - - - - - - - vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; - - vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; - - vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; - - vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; - - - - vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; - if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; - - - - - - - - vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; - - vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; - - vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; - - vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; - if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; - - - - - - - - - - - - - - - - - - - - vp9_convolve8 = vp9_convolve8_sse2; if (flags & HAS_SSSE3) vp9_convolve8 = vp9_convolve8_ssse3; - - vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; - - vp9_convolve8_vert = vp9_convolve8_vert_sse2; - if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; - vp9_convolve8_avg = vp9_convolve8_avg_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg = vp9_convolve8_avg_ssse3; - vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_horiz = vp9_convolve8_avg_horiz_ssse3; - vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_sse2; if (flags & HAS_SSSE3) vp9_convolve8_avg_vert = vp9_convolve8_avg_vert_ssse3; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3; - - vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = 
vp9_sub_pixel_avg_variance64x64_ssse3; - - vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3; - - vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3; - - vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3; - - vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3; - - vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3; - - vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3; - - vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3; - - vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3; - - vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3; - - vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3; - - vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3; - - vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3; - - vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3; - - vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3; - - vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3; - - vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3; - - vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; - - vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3; - - vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; - - vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3; - - vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3; - - vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse; - if (flags & 
HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3; - - vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3; - - vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse; - if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_sad16x16x3 = vp9_sad16x16x3_c; - if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; - - vp9_sad16x8x3 = vp9_sad16x8x3_c; - if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; - if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; - - vp9_sad8x16x3 = vp9_sad8x16x3_c; - if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; - - vp9_sad8x8x3 = vp9_sad8x8x3_c; - if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; - - vp9_sad4x4x3 = vp9_sad4x4x3_c; - if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - vp9_quantize_b = vp9_quantize_b_c; - if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; - - vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; - if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3; - - - - - - - - - - + vp9_convolve8_horiz = vp9_convolve8_horiz_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_horiz = vp9_convolve8_horiz_ssse3; + vp9_convolve8_vert = vp9_convolve8_vert_sse2; + if (flags & HAS_SSSE3) vp9_convolve8_vert = vp9_convolve8_vert_ssse3; + vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_16x16 = vp9_d153_predictor_16x16_ssse3; + vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_4x4 = vp9_d153_predictor_4x4_ssse3; + vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d153_predictor_8x8 = vp9_d153_predictor_8x8_ssse3; + vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_16x16 = vp9_d207_predictor_16x16_ssse3; + vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_32x32 = vp9_d207_predictor_32x32_ssse3; + vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_4x4 = vp9_d207_predictor_4x4_ssse3; + vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d207_predictor_8x8 = vp9_d207_predictor_8x8_ssse3; + vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_16x16 = vp9_d45_predictor_16x16_ssse3; + vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_32x32 = vp9_d45_predictor_32x32_ssse3; + vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_4x4 = vp9_d45_predictor_4x4_ssse3; + vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d45_predictor_8x8 = vp9_d45_predictor_8x8_ssse3; + vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_16x16 = vp9_d63_predictor_16x16_ssse3; + vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3; + vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_4x4 = vp9_d63_predictor_4x4_ssse3; + vp9_d63_predictor_8x8 = 
vp9_d63_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_d63_predictor_8x8 = vp9_d63_predictor_8x8_ssse3; + vp9_fdct8x8 = vp9_fdct8x8_sse2; + if (flags & HAS_SSSE3) vp9_fdct8x8 = vp9_fdct8x8_ssse3; vp9_full_search_sad = vp9_full_search_sad_c; if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3; if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8; - - vp9_refining_search_sad = vp9_refining_search_sad_c; - if (flags & HAS_SSE3) vp9_refining_search_sad = vp9_refining_search_sadx4; - - vp9_diamond_search_sad = vp9_diamond_search_sad_c; - if (flags & HAS_SSE3) vp9_diamond_search_sad = vp9_diamond_search_sadx4; + vp9_h_predictor_16x16 = vp9_h_predictor_16x16_c; + if (flags & HAS_SSSE3) vp9_h_predictor_16x16 = vp9_h_predictor_16x16_ssse3; + vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c; + if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3; + vp9_h_predictor_4x4 = vp9_h_predictor_4x4_c; + if (flags & HAS_SSSE3) vp9_h_predictor_4x4 = vp9_h_predictor_4x4_ssse3; + vp9_h_predictor_8x8 = vp9_h_predictor_8x8_c; + if (flags & HAS_SSSE3) vp9_h_predictor_8x8 = vp9_h_predictor_8x8_ssse3; + vp9_idct16x16_10_add = vp9_idct16x16_10_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_10_add = vp9_idct16x16_10_add_ssse3; + vp9_idct16x16_256_add = vp9_idct16x16_256_add_sse2; + if (flags & HAS_SSSE3) vp9_idct16x16_256_add = vp9_idct16x16_256_add_ssse3; + vp9_idct8x8_12_add = vp9_idct8x8_12_add_sse2; + if (flags & HAS_SSSE3) vp9_idct8x8_12_add = vp9_idct8x8_12_add_ssse3; + vp9_idct8x8_64_add = vp9_idct8x8_64_add_sse2; + if (flags & HAS_SSSE3) vp9_idct8x8_64_add = vp9_idct8x8_64_add_ssse3; + vp9_quantize_b = vp9_quantize_b_c; + if (flags & HAS_SSSE3) vp9_quantize_b = vp9_quantize_b_ssse3; + vp9_quantize_b_32x32 = vp9_quantize_b_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_b_32x32 = vp9_quantize_b_32x32_ssse3; + vp9_quantize_fp = vp9_quantize_fp_c; + if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; + if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; + vp9_sad16x16x3 = vp9_sad16x16x3_c; + if (flags & HAS_SSE3) vp9_sad16x16x3 = vp9_sad16x16x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x16x3 = vp9_sad16x16x3_ssse3; + vp9_sad16x8x3 = vp9_sad16x8x3_c; + if (flags & HAS_SSE3) vp9_sad16x8x3 = vp9_sad16x8x3_sse3; + if (flags & HAS_SSSE3) vp9_sad16x8x3 = vp9_sad16x8x3_ssse3; + vp9_sad4x4x3 = vp9_sad4x4x3_c; + if (flags & HAS_SSE3) vp9_sad4x4x3 = vp9_sad4x4x3_sse3; + vp9_sad8x16x3 = vp9_sad8x16x3_c; + if (flags & HAS_SSE3) vp9_sad8x16x3 = vp9_sad8x16x3_sse3; + vp9_sad8x8x3 = vp9_sad8x8x3_c; + if (flags & HAS_SSE3) vp9_sad8x8x3 = vp9_sad8x8x3_sse3; + vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x16 = vp9_sub_pixel_avg_variance16x16_ssse3; + vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x32 = vp9_sub_pixel_avg_variance16x32_ssse3; + vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance16x8 = vp9_sub_pixel_avg_variance16x8_ssse3; + vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x16 = vp9_sub_pixel_avg_variance32x16_ssse3; + vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x32 = vp9_sub_pixel_avg_variance32x32_ssse3; + 
vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance32x64 = vp9_sub_pixel_avg_variance32x64_ssse3; + vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x4 = vp9_sub_pixel_avg_variance4x4_ssse3; + vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance4x8 = vp9_sub_pixel_avg_variance4x8_ssse3; + vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x32 = vp9_sub_pixel_avg_variance64x32_ssse3; + vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance64x64 = vp9_sub_pixel_avg_variance64x64_ssse3; + vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x16 = vp9_sub_pixel_avg_variance8x16_ssse3; + vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x4 = vp9_sub_pixel_avg_variance8x4_ssse3; + vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_avg_variance8x8 = vp9_sub_pixel_avg_variance8x8_ssse3; + vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x16 = vp9_sub_pixel_variance16x16_ssse3; + vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x32 = vp9_sub_pixel_variance16x32_ssse3; + vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance16x8 = vp9_sub_pixel_variance16x8_ssse3; + vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x16 = vp9_sub_pixel_variance32x16_ssse3; + vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x32 = vp9_sub_pixel_variance32x32_ssse3; + vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance32x64 = vp9_sub_pixel_variance32x64_ssse3; + vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x4 = vp9_sub_pixel_variance4x4_ssse3; + vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_sse; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance4x8 = vp9_sub_pixel_variance4x8_ssse3; + vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x32 = vp9_sub_pixel_variance64x32_ssse3; + vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance64x64 = vp9_sub_pixel_variance64x64_ssse3; + vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x16 = vp9_sub_pixel_variance8x16_ssse3; + vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x4 = vp9_sub_pixel_variance8x4_ssse3; + vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_sse2; + if (flags & HAS_SSSE3) vp9_sub_pixel_variance8x8 = vp9_sub_pixel_variance8x8_ssse3; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx/internal/vpx_codec_internal.h b/media/libvpx/vpx/internal/vpx_codec_internal.h index 05fed977ec22..cbfffd0af2ab 100644 --- 
a/media/libvpx/vpx/internal/vpx_codec_internal.h +++ b/media/libvpx/vpx/internal/vpx_codec_internal.h @@ -41,12 +41,15 @@ * Once initialized, the instance is managed using other functions from * the vpx_codec_* family. */ -#ifndef VPX_CODEC_INTERNAL_H -#define VPX_CODEC_INTERNAL_H +#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ #include "../vpx_decoder.h" #include "../vpx_encoder.h" #include +#ifdef __cplusplus +extern "C" { +#endif /*!\brief Current ABI version number * @@ -56,7 +59,7 @@ * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/ +#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; @@ -151,9 +154,8 @@ typedef vpx_codec_err_t (*vpx_codec_get_si_fn_t)(vpx_codec_alg_priv_t *ctx, * \retval #VPX_CODEC_OK * The internal state data was deserialized. */ -typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx, - int ctrl_id, - va_list ap); +typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx, + va_list ap); /*!\brief control function pointer mapping * @@ -167,8 +169,8 @@ typedef vpx_codec_err_t (*vpx_codec_control_fn_t)(vpx_codec_alg_priv_t *ctx, * \ref MUST be non-zero. */ typedef const struct vpx_codec_ctrl_fn_map { - int ctrl_id; - vpx_codec_control_fn_t fn; + int ctrl_id; + vpx_codec_control_fn_t fn; } vpx_codec_ctrl_fn_map_t; /*!\brief decode data function pointer prototype @@ -215,37 +217,36 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx, typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter); - -/*\brief eXternal Memory Allocation memory map get iterator +/*!\brief Pass in external frame buffers for the decoder to use. * - * Iterates over a list of the memory maps requested by the decoder. The - * iterator storage should be initialized to NULL to start the iteration. - * Iteration is complete when this function returns NULL. + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. * - * \param[in out] iter Iterator storage, initialized to NULL - * - * \return Returns a pointer to an memory segment descriptor, or NULL to - * indicate end-of-list. - */ -typedef vpx_codec_err_t (*vpx_codec_get_mmap_fn_t)(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter); - - -/*\brief eXternal Memory Allocation memory map set iterator - * - * Sets a memory descriptor inside the decoder instance. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] mmap Memory map to store. + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data * * \retval #VPX_CODEC_OK - * The memory map was accepted and stored. - * \retval #VPX_CODEC_MEM_ERROR - * The memory map was rejected. + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL.
+ * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. */ -typedef vpx_codec_err_t (*vpx_codec_set_mmap_fn_t)(vpx_codec_ctx_t *ctx, - const vpx_codec_mmap_t *mmap); +typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( + vpx_codec_alg_priv_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, @@ -285,8 +286,6 @@ typedef const struct vpx_codec_enc_cfg_map { vpx_codec_enc_cfg_t cfg; } vpx_codec_enc_cfg_map_t; -#define NOT_IMPLEMENTED 0 - /*!\brief Decoder algorithm interface * * All decoders \ref MUST expose a variable of this type. @@ -298,15 +297,15 @@ struct vpx_codec_iface { vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ - vpx_codec_get_mmap_fn_t get_mmap; /**< \copydoc ::vpx_codec_get_mmap_fn_t */ - vpx_codec_set_mmap_fn_t set_mmap; /**< \copydoc ::vpx_codec_set_mmap_fn_t */ struct vpx_codec_dec_iface { vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */ vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */ + vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ } dec; struct vpx_codec_enc_iface { + int cfg_map_count; vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ @@ -336,9 +335,6 @@ typedef struct vpx_codec_priv_cb_pair { * and the pointer cast to the proper type.
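To make the new set_fb hook concrete: the public wrapper (vpx_codec_set_frame_buffer_functions(), updated later in this patch) forwards a get callback, invoked when the decoder needs a buffer of at least min_size bytes, and a release callback, invoked once the decoder drops its last reference. A sketch of a matching callback pair, assuming the vpx_codec_frame_buffer_t layout (data/size/priv) from vpx/vpx_frame_buffer.h; the app_* names are invented for illustration:

    #include <stdint.h>
    #include <stdlib.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vpx_frame_buffer.h"

    /* Called when the decoder needs a buffer of at least min_size bytes. */
    static int app_get_fb(void *priv, size_t min_size,
                          vpx_codec_frame_buffer_t *fb) {
      (void)priv;                        /* a real buffer pool would hang off priv */
      fb->data = (uint8_t *)malloc(min_size);
      if (fb->data == NULL) return -1;   /* a negative return reports failure */
      fb->size = min_size;
      fb->priv = fb->data;               /* remember what release must free */
      return 0;
    }

    /* Called when the decoder no longer references the buffer. */
    static int app_release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
      (void)priv;
      free(fb->priv);
      fb->data = NULL;
      return 0;
    }

    /* Wiring, after vpx_codec_dec_init() and before the first decode:
     *   vpx_codec_set_frame_buffer_functions(&ctx, app_get_fb,
     *                                        app_release_fb, NULL);
     */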
*/ struct vpx_codec_priv { - unsigned int sz; - vpx_codec_iface_t *iface; - struct vpx_codec_alg_priv *alg_priv; const char *err_detail; vpx_codec_flags_t init_flags; struct { @@ -346,8 +342,7 @@ struct vpx_codec_priv { vpx_codec_priv_cb_pair_t put_slice_cb; } dec; struct { - int tbd; - struct vpx_fixed_buf cx_data_dst_buf; + vpx_fixed_buf_t cx_data_dst_buf; unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; @@ -368,36 +363,13 @@ struct vpx_codec_priv_enc_mr_cfg #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ - static typ id##__value(va_list args) {return va_arg(args, typ);} \ - static typ id##__convert(void *x)\ - {\ - union\ - {\ - void *x;\ - typ d;\ - } u;\ - u.x = x;\ - return u.d;\ - } - + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} #undef VPX_CTRL_USE_TYPE_DEPRECATED #define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - static typ id##__value(va_list args) {return va_arg(args, typ);} \ - static typ id##__convert(void *x)\ - {\ - union\ - {\ - void *x;\ - typ d;\ - } u;\ - u.x = x;\ - return u.d;\ - } + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} #define CAST(id, arg) id##__value(arg) -#define RECAST(id, x) id##__convert(x) - /* CODEC_INTERFACE convenience macro * @@ -444,6 +416,7 @@ vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list, #include #include + struct vpx_internal_error_info { vpx_codec_err_t error_code; int has_detail; @@ -452,52 +425,13 @@ struct vpx_internal_error_info { jmp_buf jmp; }; -static void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, - ...) { - va_list ap; +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, + const char *fmt, + ...); - info->error_code = error; - info->has_detail = 0; - - if (fmt) { - size_t sz = sizeof(info->detail); - - info->has_detail = 1; - va_start(ap, fmt); - vsnprintf(info->detail, sz - 1, fmt, ap); - va_end(ap); - info->detail[sz - 1] = '\0'; - } - - if (info->setjmp) - longjmp(info->jmp, info->error_code); -} - -//------------------------------------------------------------------------------ -// mmap interface - -typedef struct { - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long (*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - -// Allocates mmap.priv and sets mmap.base based on mmap.sz/align/flags -// requirements. -// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. -vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap); - -// Frees mmap.base allocated by a call to vpx_mmap_alloc(). -void vpx_mmap_dtor(vpx_codec_mmap_t *mmap); - -// Checks each mmap has the size requirement specificied by mem_reqs. -// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. -vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - const mem_req_t *mem_reqs, int nreqs, - vpx_codec_flags_t init_flags); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ diff --git a/media/libvpx/vpx/internal/vpx_psnr.h b/media/libvpx/vpx/internal/vpx_psnr.h new file mode 100644 index 000000000000..07d81bb8d904 --- /dev/null +++ b/media/libvpx/vpx/internal/vpx_psnr.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_INTERNAL_VPX_PSNR_H_ +#define VPX_INTERNAL_VPX_PSNR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t + +/*!\brief Converts SSE to PSNR + * + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ +double vpx_sse_to_psnr(double samples, double peak, double sse); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_INTERNAL_VPX_PSNR_H_ diff --git a/media/libvpx/vpx/src/svc_encodeframe.c b/media/libvpx/vpx/src/svc_encodeframe.c index 2514ad303b91..8911e83040bc 100644 --- a/media/libvpx/vpx/src/svc_encodeframe.c +++ b/media/libvpx/vpx/src/svc_encodeframe.c @@ -13,15 +13,20 @@ * VP9 SVC encoding support via libvpx */ +#include +#include #include #include #include #include #define VPX_DISABLE_CTRL_TYPECHECKS 1 #define VPX_CODEC_DISABLE_COMPAT 1 +#include "./vpx_config.h" #include "vpx/svc_context.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_onyxc_int.h" #ifdef __MINGW32__ #define strtok_r strtok_s @@ -40,10 +45,19 @@ _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context); #define SUPERFRAME_SLOTS (8) #define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2) #define OPTION_BUFFER_SIZE 256 +#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27"; static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16"; +// One encoded frame +typedef struct FrameData { + void *buf; // compressed data buffer + size_t size; // length of compressed data + vpx_codec_frame_flags_t flags; /**< flags for this frame */ + struct FrameData *next; +} FrameData; + typedef struct SvcInternal { char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options char quantizers[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_quantizers @@ -53,10 +67,12 @@ typedef struct SvcInternal { int scaling_factor_num[VPX_SS_MAX_LAYERS]; int scaling_factor_den[VPX_SS_MAX_LAYERS]; int quantizer[VPX_SS_MAX_LAYERS]; + int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; // accumulated statistics - double psnr_in_layer[VPX_SS_MAX_LAYERS]; - uint32_t bytes_in_layer[VPX_SS_MAX_LAYERS]; + double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V + uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; + uint32_t bytes_sum[VPX_SS_MAX_LAYERS]; // codec encoding values int width; // width of highest layer @@ -65,156 +81,87 @@ typedef struct SvcInternal { // state variables int encode_frame_count; + int frame_received; int frame_within_gop; - vpx_enc_frame_flags_t enc_frame_flags; int layers; int layer; int is_keyframe; + int use_multiple_frame_contexts; - size_t frame_size; - size_t buffer_size; - void *buffer; + FrameData *frame_list; + FrameData *frame_temp; + + char *rc_stats_buf; + size_t rc_stats_buf_size; + size_t rc_stats_buf_used; char message_buffer[2048]; vpx_codec_ctx_t *codec_ctx; } SvcInternal; -// Superframe is used to generate an index of individual frames (i.e., layers) -struct
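The conversion declared above is the standard formula PSNR = 10 * log10(samples * peak^2 / sse), usually clamped when sse approaches zero. A sketch consistent with the declaration; the 100 dB clamp is an assumption, not something this patch states:

    #include <math.h>

    #define MAX_PSNR 100.0  /* assumed cap for the sse == 0 case */

    static double sse_to_psnr_sketch(double samples, double peak, double sse) {
      if (sse > 0.0) {
        const double psnr = 10.0 * log10(samples * peak * peak / sse);
        return psnr > MAX_PSNR ? MAX_PSNR : psnr;
      }
      return MAX_PSNR;  /* perfect reconstruction */
    }

    /* Example: a 640x480 8-bit luma plane (samples = 307200, peak = 255) with
     * sse = 1966080 (MSE = 6.4) gives 10 * log10(65025 / 6.4) ~= 40.07 dB. */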
Superframe { - int count; - uint32_t sizes[SUPERFRAME_SLOTS]; - uint32_t magnitude; - uint8_t buffer[SUPERFRAME_BUFFER_SIZE]; - size_t index_size; -}; - -// One encoded frame layer -struct LayerData { - void *buf; // compressed data buffer - size_t size; // length of compressed data - struct LayerData *next; -}; - -// create LayerData from encoder output -static struct LayerData *ld_create(void *buf, size_t size) { - struct LayerData *const layer_data = malloc(sizeof(*layer_data)); - if (layer_data == NULL) { +// create FrameData from encoder output +static struct FrameData *fd_create(void *buf, size_t size, + vpx_codec_frame_flags_t flags) { + struct FrameData *const frame_data = + (struct FrameData *)vpx_malloc(sizeof(*frame_data)); + if (frame_data == NULL) { return NULL; } - layer_data->buf = malloc(size); - if (layer_data->buf == NULL) { - free(layer_data); + frame_data->buf = vpx_malloc(size); + if (frame_data->buf == NULL) { + vpx_free(frame_data); return NULL; } - memcpy(layer_data->buf, buf, size); - layer_data->size = size; - return layer_data; + vpx_memcpy(frame_data->buf, buf, size); + frame_data->size = size; + frame_data->flags = flags; + return frame_data; } -// free LayerData -static void ld_free(struct LayerData *layer_data) { - if (layer_data) { - if (layer_data->buf) { - free(layer_data->buf); - layer_data->buf = NULL; - } - free(layer_data); +// free FrameData +static void fd_free(struct FrameData *p) { + if (p) { + if (p->buf) + vpx_free(p->buf); + vpx_free(p); } } -// add layer data to list -static void ld_list_add(struct LayerData **list, struct LayerData *layer_data) { - struct LayerData **p = list; +// add FrameData to list +static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) { + struct FrameData **p = list; while (*p != NULL) p = &(*p)->next; *p = layer_data; layer_data->next = NULL; } -// get accumulated size of layer data -static size_t ld_list_get_buffer_size(struct LayerData *list) { - struct LayerData *p; - size_t size = 0; - - for (p = list; p != NULL; p = p->next) { - size += p->size; - } - return size; -} - -// copy layer data to buffer -static void ld_list_copy_to_buffer(struct LayerData *list, uint8_t *buffer) { - struct LayerData *p; - - for (p = list; p != NULL; p = p->next) { - buffer[0] = 1; - memcpy(buffer, p->buf, p->size); - buffer += p->size; - } -} - -// free layer data list -static void ld_list_free(struct LayerData *list) { - struct LayerData *p = list; +// free FrameData list +static void fd_free_list(struct FrameData *list) { + struct FrameData *p = list; while (p) { list = list->next; - ld_free(p); + fd_free(p); p = list; } } -static void sf_create_index(struct Superframe *sf) { - uint8_t marker = 0xc0; - int i; - uint32_t mag, mask; - uint8_t *bufp; - - if (sf->count == 0 || sf->count >= 8) return; - - // Add the number of frames to the marker byte - marker |= sf->count - 1; - - // Choose the magnitude - for (mag = 0, mask = 0xff; mag < 4; ++mag) { - if (sf->magnitude < mask) break; - mask <<= 8; - mask |= 0xff; - } - marker |= mag << 3; - - // Write the index - sf->index_size = 2 + (mag + 1) * sf->count; - bufp = sf->buffer; - - *bufp++ = marker; - for (i = 0; i < sf->count; ++i) { - int this_sz = sf->sizes[i]; - uint32_t j; - - for (j = 0; j <= mag; ++j) { - *bufp++ = this_sz & 0xff; - this_sz >>= 8; - } - } - *bufp++ = marker; -} - static SvcInternal *get_svc_internal(SvcContext *svc_ctx) { if (svc_ctx == NULL) return NULL; if (svc_ctx->internal == NULL) { - SvcInternal *const si = malloc(sizeof(*si)); + 
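fd_list_add() above appends with the pointer-to-pointer idiom: by walking a FrameData ** the loop ends with *p pointing at either the list head or some node's next field, so the empty list needs no special case. The same idiom in isolation, with a hypothetical node type:

    #include <stdio.h>

    struct node { int v; struct node *next; };

    /* Append at the tail; works unchanged for an empty list. */
    static void list_add(struct node **list, struct node *n) {
      struct node **p = list;
      while (*p != NULL) p = &(*p)->next;  /* p stops at the null link */
      *p = n;
      n->next = NULL;
    }

    int main(void) {
      struct node *head = NULL, *p;
      struct node a = {1, NULL}, b = {2, NULL};
      list_add(&head, &a);                 /* here *p was &head */
      list_add(&head, &b);                 /* here *p was &a.next */
      for (p = head; p != NULL; p = p->next) printf("%d\n", p->v);
      return 0;
    }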
SvcInternal *const si = (SvcInternal *)malloc(sizeof(*si)); if (si != NULL) { memset(si, 0, sizeof(*si)); } svc_ctx->internal = si; } - return svc_ctx->internal; + return (SvcInternal *)svc_ctx->internal; } static const SvcInternal *get_const_svc_internal(const SvcContext *svc_ctx) { if (svc_ctx == NULL) return NULL; - return svc_ctx->internal; + return (const SvcInternal *)svc_ctx->internal; } static void svc_log_reset(SvcContext *svc_ctx) { @@ -222,7 +169,8 @@ static void svc_log_reset(SvcContext *svc_ctx) { si->message_buffer[0] = '\0'; } -static int svc_log(SvcContext *svc_ctx, int level, const char *fmt, ...) { +static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, + const char *fmt, ...) { char buf[512]; int retval = 0; va_list ap; @@ -249,23 +197,6 @@ static int svc_log(SvcContext *svc_ctx, int level, const char *fmt, ...) { return retval; } -static vpx_codec_err_t set_option_encoding_mode(SvcContext *svc_ctx, - const char *value_str) { - if (strcmp(value_str, "i") == 0) { - svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_I; - } else if (strcmp(value_str, "alt-ip") == 0) { - svc_ctx->encoding_mode = ALT_INTER_LAYER_PREDICTION_IP; - } else if (strcmp(value_str, "ip") == 0) { - svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_IP; - } else if (strcmp(value_str, "gf") == 0) { - svc_ctx->encoding_mode = USE_GOLDEN_FRAME; - } else { - svc_log(svc_ctx, SVC_LOG_ERROR, "invalid encoding mode: %s", value_str); - return VPX_CODEC_INVALID_PARAM; - } - return VPX_CODEC_OK; -} - static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx, const char *quantizer_values) { char *input_string; @@ -274,7 +205,7 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx, char *save_ptr; int found = 0; int i, q; - int res = VPX_CODEC_OK; + vpx_codec_err_t res = VPX_CODEC_OK; SvcInternal *const si = get_svc_internal(svc_ctx); if (quantizer_values == NULL || strlen(quantizer_values) == 0) { @@ -310,6 +241,59 @@ static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx, return res; } +static vpx_codec_err_t parse_auto_alt_ref(SvcContext *svc_ctx, + const char *alt_ref_options) { + char *input_string; + char *token; + const char *delim = ","; + char *save_ptr; + int found = 0, enabled = 0; + int i, value; + vpx_codec_err_t res = VPX_CODEC_OK; + SvcInternal *const si = get_svc_internal(svc_ctx); + + if (alt_ref_options == NULL || strlen(alt_ref_options) == 0) { + return VPX_CODEC_INVALID_PARAM; + } else { + input_string = strdup(alt_ref_options); + } + + token = strtok_r(input_string, delim, &save_ptr); + for (i = 0; i < svc_ctx->spatial_layers; ++i) { + if (token != NULL) { + value = atoi(token); + if (value < 0 || value > 1) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "enable auto alt ref values: invalid value %s\n", token); + res = VPX_CODEC_INVALID_PARAM; + break; + } + token = strtok_r(NULL, delim, &save_ptr); + found = i + 1; + } else { + value = 0; + } + si->enable_auto_alt_ref[i] = value; + if (value > 0) + ++enabled; + } + if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "svc: auto alt refs: %d values required, but only %d specified\n", + svc_ctx->spatial_layers, found); + res = VPX_CODEC_INVALID_PARAM; + } + if (enabled > REF_FRAMES - svc_ctx->spatial_layers) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "svc: auto alt ref: Maximum %d (REF_FRAMES - layers) layers can " + "enable auto alt reference frames, but %d layers are enabled\n", + REF_FRAMES - svc_ctx->spatial_layers, enabled); + res = VPX_CODEC_INVALID_PARAM; + }
free(input_string); + return res; +} + static void log_invalid_scale_factor(SvcContext *svc_ctx, const char *value) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc scale-factors: invalid value %s\n", value); @@ -324,7 +308,7 @@ static vpx_codec_err_t parse_scale_factors(SvcContext *svc_ctx, int found = 0; int i; int64_t num, den; - int res = VPX_CODEC_OK; + vpx_codec_err_t res = VPX_CODEC_OK; SvcInternal *const si = get_svc_internal(svc_ctx); if (scale_factors == NULL || strlen(scale_factors) == 0) { @@ -383,7 +367,8 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { char *option_name; char *option_value; char *input_ptr; - int res = VPX_CODEC_OK; + SvcInternal *const si = get_svc_internal(svc_ctx); + vpx_codec_err_t res = VPX_CODEC_OK; if (options == NULL) return VPX_CODEC_OK; input_string = strdup(options); @@ -399,17 +384,21 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { res = VPX_CODEC_INVALID_PARAM; break; } - if (strcmp("encoding-mode", option_name) == 0) { - res = set_option_encoding_mode(svc_ctx, option_value); - if (res != VPX_CODEC_OK) break; - } else if (strcmp("layers", option_name) == 0) { + if (strcmp("spatial-layers", option_name) == 0) { svc_ctx->spatial_layers = atoi(option_value); + } else if (strcmp("temporal-layers", option_name) == 0) { + svc_ctx->temporal_layers = atoi(option_value); } else if (strcmp("scale-factors", option_name) == 0) { res = parse_scale_factors(svc_ctx, option_value); if (res != VPX_CODEC_OK) break; } else if (strcmp("quantizers", option_name) == 0) { res = parse_quantizer_values(svc_ctx, option_value); if (res != VPX_CODEC_OK) break; + } else if (strcmp("auto-alt-refs", option_name) == 0) { + res = parse_auto_alt_ref(svc_ctx, option_value); + if (res != VPX_CODEC_OK) break; + } else if (strcmp("multi-frame-contexts", option_name) == 0) { + si->use_multiple_frame_contexts = atoi(option_value); } else { svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name); res = VPX_CODEC_INVALID_PARAM; @@ -418,6 +407,12 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { option_name = strtok_r(NULL, "=", &input_ptr); } free(input_string); + + if (si->use_multiple_frame_contexts && + (svc_ctx->spatial_layers > 3 || + svc_ctx->spatial_layers * svc_ctx->temporal_layers > 4)) + res = VPX_CODEC_INVALID_PARAM; + return res; } @@ -456,8 +451,8 @@ vpx_codec_err_t vpx_svc_set_scale_factors(SvcContext *svc_ctx, vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *enc_cfg) { - int max_intra_size_pct; vpx_codec_err_t res; + int i; SvcInternal *const si = get_svc_internal(svc_ctx); if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL || enc_cfg == NULL) { @@ -485,9 +480,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_ctx->spatial_layers); return VPX_CODEC_INVALID_PARAM; } - // use SvcInternal value for number of layers to enable forcing single layer - // for first frame - si->layers = svc_ctx->spatial_layers; res = parse_quantizer_values(svc_ctx, si->quantizers); if (res != VPX_CODEC_OK) return res; @@ -495,30 +487,86 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, res = parse_scale_factors(svc_ctx, si->scale_factors); if (res != VPX_CODEC_OK) return res; - // parse aggregate command line options + // Parse aggregate command line options. 
Options must start with + "spatial-layers=xx", followed by the other options res = parse_options(svc_ctx, si->options); if (res != VPX_CODEC_OK) return res; + if (svc_ctx->spatial_layers < 1) + svc_ctx->spatial_layers = 1; + if (svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS) + svc_ctx->spatial_layers = VPX_SS_MAX_LAYERS; + + if (svc_ctx->temporal_layers < 1) + svc_ctx->temporal_layers = 1; + if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS) + svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS; + + si->layers = svc_ctx->spatial_layers; + + // Assign target bitrate for each layer. We calculate the ratio + // from the resolution for now. + // TODO(Minghai): Optimize the mechanism of allocating bits after + // implementing svc two pass rate control. + if (si->layers > 1) { + float total = 0; + float alloc_ratio[VPX_SS_MAX_LAYERS] = {0}; + + assert(si->layers <= VPX_SS_MAX_LAYERS); + for (i = 0; i < si->layers; ++i) { + int pos = i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers; + if (pos < VPX_SS_MAX_LAYERS && si->scaling_factor_den[pos] > 0) { + alloc_ratio[i] = (float)(si->scaling_factor_num[pos] * 1.0 / + si->scaling_factor_den[pos]); + + alloc_ratio[i] *= alloc_ratio[i]; + total += alloc_ratio[i]; + } + } + + for (i = 0; i < si->layers; ++i) { + if (total > 0) { + enc_cfg->ss_target_bitrate[i] = (unsigned int) + (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total); + } + } + } + +#if CONFIG_SPATIAL_SVC + for (i = 0; i < si->layers; ++i) + enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; +#endif + + if (svc_ctx->temporal_layers > 1) { + int i; + for (i = 0; i < svc_ctx->temporal_layers; ++i) { + enc_cfg->ts_target_bitrate[i] = enc_cfg->rc_target_bitrate / + svc_ctx->temporal_layers; + enc_cfg->ts_rate_decimator[i] = 1 << (svc_ctx->temporal_layers - 1 - i); + } + } + // modify encoder configuration enc_cfg->ss_number_layers = si->layers; - enc_cfg->kf_mode = VPX_KF_DISABLED; - enc_cfg->g_pass = VPX_RC_ONE_PASS; - // Lag in frames not currently supported - enc_cfg->g_lag_in_frames = 0; + enc_cfg->ts_number_layers = svc_ctx->temporal_layers; // TODO(ivanmaltz): determine if these values need to be set explicitly for // svc, or if the normal default/override mechanism can be used enc_cfg->rc_dropframe_thresh = 0; - enc_cfg->rc_end_usage = VPX_CBR; enc_cfg->rc_resize_allowed = 0; - enc_cfg->rc_min_quantizer = 33; - enc_cfg->rc_max_quantizer = 33; + + if (enc_cfg->g_pass == VPX_RC_ONE_PASS) { + enc_cfg->rc_min_quantizer = 33; + enc_cfg->rc_max_quantizer = 33; + } + enc_cfg->rc_undershoot_pct = 100; enc_cfg->rc_overshoot_pct = 15; enc_cfg->rc_buf_initial_sz = 500; enc_cfg->rc_buf_optimal_sz = 600; enc_cfg->rc_buf_sz = 1000; - enc_cfg->g_error_resilient = 1; + if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) + enc_cfg->g_error_resilient = 1; // Initialize codec res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR); @@ -528,142 +576,11 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, } vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1); - vpx_codec_control(codec_ctx, VP8E_SET_CPUUSED, 1); - vpx_codec_control(codec_ctx, VP8E_SET_STATIC_THRESHOLD, 1); - vpx_codec_control(codec_ctx, VP8E_SET_NOISE_SENSITIVITY, 1); vpx_codec_control(codec_ctx, VP8E_SET_TOKEN_PARTITIONS, 1); - max_intra_size_pct = - (int)(((double)enc_cfg->rc_buf_optimal_sz * 0.5) * - ((double)enc_cfg->g_timebase.den / enc_cfg->g_timebase.num) / 10.0); - vpx_codec_control(codec_ctx, VP8E_SET_MAX_INTRA_BITRATE_PCT, - max_intra_size_pct); return VPX_CODEC_OK; } -//
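For orientation, the expected call sequence on the application side is: zero an SvcContext, hand an option string to vpx_svc_set_options(), then call vpx_svc_init(), which rewrites enc_cfg (layer counts, per-layer bitrates) and performs vpx_codec_enc_init() itself. A hedged sketch of that flow; the dimensions and the option string are examples only, and the SvcContext field names follow vpx/svc_context.h of this vintage:

    #include <string.h>
    #include "vpx/svc_context.h"
    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    static vpx_codec_err_t open_svc_encoder(SvcContext *svc_ctx,
                                            vpx_codec_ctx_t *codec_ctx,
                                            vpx_codec_enc_cfg_t *cfg) {
      vpx_codec_err_t res;

      memset(svc_ctx, 0, sizeof(*svc_ctx));
      svc_ctx->log_level = SVC_LOG_INFO;
      svc_ctx->spatial_layers = 2;

      res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), cfg, 0);
      if (res != VPX_CODEC_OK) return res;
      cfg->g_w = 1280;  /* dimensions of the highest layer */
      cfg->g_h = 720;

      /* Space-separated name=value pairs; keys match parse_options() above. */
      res = vpx_svc_set_options(svc_ctx,
                                "spatial-layers=2 scale-factors=1/2,1/1");
      if (res != VPX_CODEC_OK) return res;

      /* Adjusts cfg and calls vpx_codec_enc_init() internally. */
      return vpx_svc_init(svc_ctx, codec_ctx, vpx_codec_vp9_cx(), cfg);
    }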
SVC Algorithm flags - these get mapped to VP8_EFLAG_* defined in vp8cx.h - -// encoder should reference the last frame -#define USE_LAST (1 << 0) - -// encoder should reference the alt ref frame -#define USE_ARF (1 << 1) - -// encoder should reference the golden frame -#define USE_GF (1 << 2) - -// encoder should copy current frame to the last frame buffer -#define UPDATE_LAST (1 << 3) - -// encoder should copy current frame to the alt ref frame buffer -#define UPDATE_ARF (1 << 4) - -// encoder should copy current frame to the golden frame -#define UPDATE_GF (1 << 5) - -static int map_vp8_flags(int svc_flags) { - int flags = 0; - - if (!(svc_flags & USE_LAST)) flags |= VP8_EFLAG_NO_REF_LAST; - if (!(svc_flags & USE_ARF)) flags |= VP8_EFLAG_NO_REF_ARF; - if (!(svc_flags & USE_GF)) flags |= VP8_EFLAG_NO_REF_GF; - - if (svc_flags & UPDATE_LAST) { - // last is updated automatically - } else { - flags |= VP8_EFLAG_NO_UPD_LAST; - } - if (svc_flags & UPDATE_ARF) { - flags |= VP8_EFLAG_FORCE_ARF; - } else { - flags |= VP8_EFLAG_NO_UPD_ARF; - } - if (svc_flags & UPDATE_GF) { - flags |= VP8_EFLAG_FORCE_GF; - } else { - flags |= VP8_EFLAG_NO_UPD_GF; - } - return flags; -} - -/** - * Helper to check if the current frame is the first, full resolution dummy. - */ -static int vpx_svc_dummy_frame(SvcContext *svc_ctx) { - SvcInternal *const si = get_svc_internal(svc_ctx); - return svc_ctx->first_frame_full_size == 1 && si->encode_frame_count == 0; -} - -static void calculate_enc_frame_flags(SvcContext *svc_ctx) { - vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF; - SvcInternal *const si = get_svc_internal(svc_ctx); - const int is_keyframe = (si->frame_within_gop == 0); - - // keyframe layer zero is identical for all modes - if ((is_keyframe && si->layer == 0) || vpx_svc_dummy_frame(svc_ctx)) { - si->enc_frame_flags = VPX_EFLAG_FORCE_KF; - return; - } - - switch (svc_ctx->encoding_mode) { - case ALT_INTER_LAYER_PREDICTION_IP: - if (si->layer == 0) { - flags = map_vp8_flags(USE_LAST | UPDATE_LAST); - } else if (is_keyframe) { - if (si->layer == si->layers - 1) { - flags = map_vp8_flags(USE_ARF | UPDATE_LAST); - } else { - flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF); - } - } else { - flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST); - } - break; - case INTER_LAYER_PREDICTION_I: - if (si->layer == 0) { - flags = map_vp8_flags(USE_LAST | UPDATE_LAST); - } else if (is_keyframe) { - flags = map_vp8_flags(USE_ARF | UPDATE_LAST); - } else { - flags = map_vp8_flags(USE_LAST | UPDATE_LAST); - } - break; - case INTER_LAYER_PREDICTION_IP: - if (si->layer == 0) { - flags = map_vp8_flags(USE_LAST | UPDATE_LAST); - } else if (is_keyframe) { - flags = map_vp8_flags(USE_ARF | UPDATE_LAST); - } else { - flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST); - } - break; - case USE_GOLDEN_FRAME: - if (2 * si->layers - SVC_REFERENCE_FRAMES <= si->layer) { - if (si->layer == 0) { - flags = map_vp8_flags(USE_LAST | USE_GF | UPDATE_LAST); - } else if (is_keyframe) { - flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF); - } else { - flags = map_vp8_flags(USE_LAST | USE_ARF | USE_GF | UPDATE_LAST); - } - } else { - if (si->layer == 0) { - flags = map_vp8_flags(USE_LAST | UPDATE_LAST); - } else if (is_keyframe) { - flags = map_vp8_flags(USE_ARF | UPDATE_LAST); - } else { - flags = map_vp8_flags(USE_LAST | UPDATE_LAST); - } - } - break; - default: - svc_log(svc_ctx, SVC_LOG_ERROR, "unexpected encoding mode: %d\n", - svc_ctx->encoding_mode); - break; - } - si->enc_frame_flags = flags; -} - vpx_codec_err_t 
vpx_svc_get_layer_resolution(const SvcContext *svc_ctx, int layer, unsigned int *width, @@ -701,66 +618,26 @@ static void set_svc_parameters(SvcContext *svc_ctx, SvcInternal *const si = get_svc_internal(svc_ctx); memset(&svc_params, 0, sizeof(svc_params)); - svc_params.layer = si->layer; - svc_params.flags = si->enc_frame_flags; + svc_params.temporal_layer = 0; + svc_params.spatial_layer = si->layer; layer = si->layer; - if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP && - si->frame_within_gop == 0) { - // layers 1 & 3 don't exist in this mode, use the higher one - if (layer == 0 || layer == 2) { - layer += 1; - } - } if (VPX_CODEC_OK != vpx_svc_get_layer_resolution(svc_ctx, layer, &svc_params.width, &svc_params.height)) { svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n"); } layer_index = layer + VPX_SS_MAX_LAYERS - si->layers; - svc_params.min_quantizer = si->quantizer[layer_index]; - svc_params.max_quantizer = si->quantizer[layer_index]; + + if (codec_ctx->config.enc->g_pass == VPX_RC_ONE_PASS) { + svc_params.min_quantizer = si->quantizer[layer_index]; + svc_params.max_quantizer = si->quantizer[layer_index]; + } else { + svc_params.min_quantizer = codec_ctx->config.enc->rc_min_quantizer; + svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer; + } + svc_params.distance_from_i_frame = si->frame_within_gop; - - // Use buffer i for layer i LST - svc_params.lst_fb_idx = si->layer; - - // Use buffer i-1 for layer i Alt (Inter-layer prediction) - if (si->layer != 0) { - const int use_higher_layer = - svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP && - si->frame_within_gop == 0; - svc_params.alt_fb_idx = use_higher_layer ? si->layer - 2 : si->layer - 1; - } - - if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP) { - svc_params.gld_fb_idx = si->layer + 1; - } else { - if (si->layer < 2 * si->layers - SVC_REFERENCE_FRAMES) - svc_params.gld_fb_idx = svc_params.lst_fb_idx; - else - svc_params.gld_fb_idx = 2 * si->layers - 1 - si->layer; - } - - svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, layer: %d, %dx%d, q: %d\n", - si->encode_frame_count, si->layer, svc_params.width, - svc_params.height, svc_params.min_quantizer); - - if (svc_params.flags == VPX_EFLAG_FORCE_KF) { - svc_log(svc_ctx, SVC_LOG_DEBUG, "flags == VPX_EFLAG_FORCE_KF\n"); - } else { - svc_log( - svc_ctx, SVC_LOG_DEBUG, "Using: LST/GLD/ALT [%2d|%2d|%2d]\n", - svc_params.flags & VP8_EFLAG_NO_REF_LAST ? -1 : svc_params.lst_fb_idx, - svc_params.flags & VP8_EFLAG_NO_REF_GF ? -1 : svc_params.gld_fb_idx, - svc_params.flags & VP8_EFLAG_NO_REF_ARF ? -1 : svc_params.alt_fb_idx); - svc_log( - svc_ctx, SVC_LOG_DEBUG, "Updating: LST/GLD/ALT [%2d|%2d|%2d]\n", - svc_params.flags & VP8_EFLAG_NO_UPD_LAST ? -1 : svc_params.lst_fb_idx, - svc_params.flags & VP8_EFLAG_NO_UPD_GF ? -1 : svc_params.gld_fb_idx, - svc_params.flags & VP8_EFLAG_NO_UPD_ARF ? 
-1 : svc_params.alt_fb_idx); - } - vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &svc_params); } @@ -774,123 +651,116 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, vpx_codec_err_t res; vpx_codec_iter_t iter; const vpx_codec_cx_pkt_t *cx_pkt; - struct LayerData *cx_layer_list = NULL; - struct LayerData *layer_data; - struct Superframe superframe; + int layer_for_psnr = 0; SvcInternal *const si = get_svc_internal(svc_ctx); - if (svc_ctx == NULL || codec_ctx == NULL || rawimg == NULL || si == NULL) { + if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - memset(&superframe, 0, sizeof(superframe)); svc_log_reset(svc_ctx); + si->rc_stats_buf_used = 0; - si->layers = vpx_svc_dummy_frame(svc_ctx) ? 1 : svc_ctx->spatial_layers; - if (si->frame_within_gop >= si->kf_dist || - si->encode_frame_count == 0 || - (si->encode_frame_count == 1 && svc_ctx->first_frame_full_size == 1)) { + si->layers = svc_ctx->spatial_layers; + if (si->encode_frame_count == 0) { si->frame_within_gop = 0; } si->is_keyframe = (si->frame_within_gop == 0); - si->frame_size = 0; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "vpx_svc_encode layers: %d, frame_count: %d, frame_within_gop: %d\n", - si->layers, si->encode_frame_count, si->frame_within_gop); + if (rawimg != NULL) { + svc_log(svc_ctx, SVC_LOG_DEBUG, + "vpx_svc_encode layers: %d, frame_count: %d, " + "frame_within_gop: %d\n", si->layers, si->encode_frame_count, + si->frame_within_gop); + } - // encode each layer - for (si->layer = 0; si->layer < si->layers; ++si->layer) { - if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP && - si->is_keyframe && (si->layer == 1 || si->layer == 3)) { - svc_log(svc_ctx, SVC_LOG_DEBUG, "Skip encoding layer %d\n", si->layer); - continue; - } - calculate_enc_frame_flags(svc_ctx); - - if (vpx_svc_dummy_frame(svc_ctx)) { - // do not set svc parameters, use normal encode - svc_log(svc_ctx, SVC_LOG_DEBUG, "encoding full size first frame\n"); - } else { + if (rawimg != NULL) { + // encode each layer + for (si->layer = 0; si->layer < si->layers; ++si->layer) { set_svc_parameters(svc_ctx, codec_ctx); } - res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, - si->enc_frame_flags, deadline); - if (res != VPX_CODEC_OK) { - return res; - } - // save compressed data - iter = NULL; - while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { - switch (cx_pkt->kind) { - case VPX_CODEC_CX_FRAME_PKT: { - const uint32_t frame_pkt_size = (uint32_t)(cx_pkt->data.frame.sz); - if (!vpx_svc_dummy_frame(svc_ctx)) { - si->bytes_in_layer[si->layer] += frame_pkt_size; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, size: %u\n", - si->encode_frame_count, si->layer, frame_pkt_size); - } - layer_data = - ld_create(cx_pkt->data.frame.buf, (size_t)frame_pkt_size); - if (layer_data == NULL) { - svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating LayerData\n"); - return 0; - } - ld_list_add(&cx_layer_list, layer_data); + } - // save layer size in superframe index - superframe.sizes[superframe.count++] = frame_pkt_size; - superframe.magnitude |= frame_pkt_size; - break; + res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0, + deadline); + if (res != VPX_CODEC_OK) { + return res; + } + // save compressed data + iter = NULL; + while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { + switch (cx_pkt->kind) { + case VPX_CODEC_CX_FRAME_PKT: { + fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf, + cx_pkt->data.frame.sz, + 
cx_pkt->data.frame.flags)); + + svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, " + "pts: %d\n", si->frame_received, + (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0, + (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts); + + ++si->frame_received; + layer_for_psnr = 0; + break; + } + case VPX_CODEC_PSNR_PKT: { + int i; + svc_log(svc_ctx, SVC_LOG_DEBUG, + "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): " + "%2.3f %2.3f %2.3f %2.3f \n", + si->frame_received, layer_for_psnr, + cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1], + cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]); + svc_log(svc_ctx, SVC_LOG_DEBUG, + "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): " + "%2.3f %2.3f %2.3f %2.3f \n", + si->frame_received, layer_for_psnr, + cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1], + cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]); + for (i = 0; i < COMPONENTS; i++) { + si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i]; + si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i]; } - case VPX_CODEC_PSNR_PKT: { - if (!vpx_svc_dummy_frame(svc_ctx)) { - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->encode_frame_count, si->layer, - cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1], - cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]); - si->psnr_in_layer[si->layer] += cx_pkt->data.psnr.psnr[0]; + ++layer_for_psnr; + break; + } + case VPX_CODEC_STATS_PKT: { + size_t new_size = si->rc_stats_buf_used + + cx_pkt->data.twopass_stats.sz; + + if (new_size > si->rc_stats_buf_size) { + char *p = (char*)realloc(si->rc_stats_buf, new_size); + if (p == NULL) { + svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n"); + return VPX_CODEC_MEM_ERROR; } - break; - } - default: { - break; + si->rc_stats_buf = p; + si->rc_stats_buf_size = new_size; } + + memcpy(si->rc_stats_buf + si->rc_stats_buf_used, + cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz); + si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz; + break; + } +#if CONFIG_SPATIAL_SVC + case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: { + int i; + for (i = 0; i < si->layers; ++i) + si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; + break; + } +#endif + default: { + break; } } } - // add superframe index to layer data list - if (!vpx_svc_dummy_frame(svc_ctx)) { - sf_create_index(&superframe); - layer_data = ld_create(superframe.buffer, superframe.index_size); - ld_list_add(&cx_layer_list, layer_data); + + if (rawimg != NULL) { + ++si->frame_within_gop; + ++si->encode_frame_count; } - // get accumulated size of layer data - si->frame_size = ld_list_get_buffer_size(cx_layer_list); - if (si->frame_size == 0) return VPX_CODEC_ERROR; - - // all layers encoded, create single buffer with concatenated layers - if (si->frame_size > si->buffer_size) { - free(si->buffer); - si->buffer = malloc(si->frame_size); - if (si->buffer == NULL) { - ld_list_free(cx_layer_list); - return VPX_CODEC_MEM_ERROR; - } - si->buffer_size = si->frame_size; - } - // copy layer data into packet - ld_list_copy_to_buffer(cx_layer_list, si->buffer); - - ld_list_free(cx_layer_list); - - svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, pts: %d\n", - si->encode_frame_count, si->is_keyframe, (int)si->frame_size, - (int)pts); - ++si->frame_within_gop; - ++si->encode_frame_count; return VPX_CODEC_OK; } @@ -901,16 +771,27 @@ const char *vpx_svc_get_message(const SvcContext *svc_ctx) { return si->message_buffer; } -void 
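The switch above is the consumer side of libvpx's output iterator: after each vpx_codec_encode() call, the application drains packets until vpx_codec_get_cx_data() returns NULL and dispatches on pkt->kind. The caller-facing shape of that loop, as a sketch (write_frame() is a hypothetical sink):

    #include <stdio.h>
    #include "vpx/vpx_encoder.h"

    /* Hypothetical sink; a real muxer would go here. */
    static void write_frame(const void *buf, size_t sz) {
      fwrite(buf, 1, sz, stdout);
    }

    static void drain_packets(vpx_codec_ctx_t *codec) {
      vpx_codec_iter_t iter = NULL;  /* must restart at NULL for every drain */
      const vpx_codec_cx_pkt_t *pkt;

      while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
        switch (pkt->kind) {
          case VPX_CODEC_CX_FRAME_PKT:   /* a compressed frame */
            write_frame(pkt->data.frame.buf, pkt->data.frame.sz);
            break;
          case VPX_CODEC_PSNR_PKT:       /* psnr[0] holds the combined value */
            break;
          case VPX_CODEC_STATS_PKT:      /* first-pass rate-control stats */
            break;
          default:
            break;
        }
      }
    }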
*vpx_svc_get_buffer(const SvcContext *svc_ctx) { - const SvcInternal *const si = get_const_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - return si->buffer; +// We maintain a list of output frame buffers, since with lag_in_frames we +// need to output all frame buffers at the end. vpx_svc_get_buffer() removes +// a frame buffer from the list and keeps it in a temporary pointer, which is +// freed at the next vpx_svc_get_buffer() call or when the encoder is closed. +void *vpx_svc_get_buffer(SvcContext *svc_ctx) { + SvcInternal *const si = get_svc_internal(svc_ctx); + if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL; + + if (si->frame_temp) + fd_free(si->frame_temp); + + si->frame_temp = si->frame_list; + si->frame_list = si->frame_list->next; + + return si->frame_temp->buf; } size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) { const SvcInternal *const si = get_const_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return 0; - return si->frame_size; + if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0; + return si->frame_list->size; } int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) { @@ -921,8 +802,8 @@ int vpx_svc_is_keyframe(const SvcContext *svc_ctx) { const SvcInternal *const si = get_const_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return 0; - return si->is_keyframe; + if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0; + return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0; } void vpx_svc_set_keyframe(SvcContext *svc_ctx) { @@ -931,35 +812,63 @@ void vpx_svc_set_keyframe(SvcContext *svc_ctx) { si->frame_within_gop = 0; } +static double calc_psnr(double d) { + if (d == 0) return 100; + return -10.0 * log(d) / log(10.0); +} + // dump accumulated statistics and reset accumulated values const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { - int number_of_frames, number_of_keyframes, encode_frame_count; - int i; + int number_of_frames, encode_frame_count; + int i, j; uint32_t bytes_total = 0; + double scale[COMPONENTS]; + double psnr[COMPONENTS]; + double mse[COMPONENTS]; + double y_scale; + SvcInternal *const si = get_svc_internal(svc_ctx); if (svc_ctx == NULL || si == NULL) return NULL; svc_log_reset(svc_ctx); encode_frame_count = si->encode_frame_count; - if (svc_ctx->first_frame_full_size) encode_frame_count--; if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx); svc_log(svc_ctx, SVC_LOG_INFO, "\n"); - number_of_keyframes = encode_frame_count / si->kf_dist + 1; for (i = 0; i < si->layers; ++i) { number_of_frames = encode_frame_count; - if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP && - (i == 1 || i == 3)) { - number_of_frames -= number_of_keyframes; + svc_log(svc_ctx, SVC_LOG_INFO, + "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n", + i, (double)si->psnr_sum[i][0] / number_of_frames, + (double)si->psnr_sum[i][1] / number_of_frames, + (double)si->psnr_sum[i][2] / number_of_frames, + (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); + // the following psnr calculation is deduced from ffmpeg.c#print_report + y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames; + scale[1] = y_scale; + scale[2] = scale[3] = y_scale / 4; // U or V + scale[0] = y_scale * 1.5; // total + + for (j = 0; j < COMPONENTS; j++) { + psnr[j] = calc_psnr(si->sse_sum[i][j] / scale[j]); + mse[j] = si->sse_sum[i][j] * 255.0 * 255.0 /
scale[j]; + } + svc_log(svc_ctx, SVC_LOG_INFO, + "Layer %d Overall PSNR=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, psnr[0], + psnr[1], psnr[2], psnr[3]); + svc_log(svc_ctx, SVC_LOG_INFO, + "Layer %d Overall MSE=[%2.3f, %2.3f, %2.3f, %2.3f]\n", i, mse[0], + mse[1], mse[2], mse[3]); + + bytes_total += si->bytes_sum[i]; + // clear sums for next time + si->bytes_sum[i] = 0; + for (j = 0; j < COMPONENTS; ++j) { + si->psnr_sum[i][j] = 0; + si->sse_sum[i][j] = 0; } - svc_log(svc_ctx, SVC_LOG_INFO, "Layer %d PSNR=[%2.3f], Bytes=[%u]\n", i, - (double)si->psnr_in_layer[i] / number_of_frames, - si->bytes_in_layer[i]); - bytes_total += si->bytes_in_layer[i]; - si->psnr_in_layer[i] = 0; - si->bytes_in_layer[i] = 0; } // only display statistics once @@ -976,8 +885,26 @@ void vpx_svc_release(SvcContext *svc_ctx) { // SvcInternal if it was not already allocated si = (SvcInternal *)svc_ctx->internal; if (si != NULL) { - free(si->buffer); + fd_free(si->frame_temp); + fd_free_list(si->frame_list); + if (si->rc_stats_buf) { + free(si->rc_stats_buf); + } free(si); svc_ctx->internal = NULL; } } + +size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx) { + const SvcInternal *const si = get_const_svc_internal(svc_ctx); + if (svc_ctx == NULL || si == NULL) return 0; + return si->rc_stats_buf_used; +} + +char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx) { + const SvcInternal *const si = get_const_svc_internal(svc_ctx); + if (svc_ctx == NULL || si == NULL) return NULL; + return si->rc_stats_buf; +} + + diff --git a/media/libvpx/vpx/src/vpx_codec.c b/media/libvpx/vpx/src/vpx_codec.c index 1f664ae494bb..5a495ce814b8 100644 --- a/media/libvpx/vpx/src/vpx_codec.c +++ b/media/libvpx/vpx/src/vpx_codec.c @@ -88,8 +88,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; else { - if (ctx->priv->alg_priv) - ctx->iface->destroy(ctx->priv->alg_priv); + ctx->iface->destroy((vpx_codec_alg_priv_t *)ctx->priv); ctx->iface = NULL; ctx->name = NULL; @@ -125,7 +124,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, va_list ap; va_start(ap, ctrl_id); - res = entry->fn(ctx->priv->alg_priv, ctrl_id, ap); + res = entry->fn((vpx_codec_alg_priv_t *)ctx->priv, ap); va_end(ap); break; } @@ -135,50 +134,25 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -//------------------------------------------------------------------------------ -// mmap interface +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, + const char *fmt, + ...) { + va_list ap; -vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap) { - unsigned int align = mmap->align ? 
mmap->align - 1 : 0; + info->error_code = error; + info->has_detail = 0; - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); + if (fmt) { + size_t sz = sizeof(info->detail); - if (mmap->priv == NULL) return VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vpx_mmap_dtor; - return VPX_CODEC_OK; -} - -void vpx_mmap_dtor(vpx_codec_mmap_t *mmap) { - free(mmap->priv); -} - -vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - const mem_req_t *mem_reqs, int nreqs, - vpx_codec_flags_t init_flags) { - int i; - - for (i = 0; i < nreqs - 1; ++i) { - /* Ensure the segment has been allocated */ - if (mmaps[i].base == NULL) { - return VPX_CODEC_MEM_ERROR; - } - - /* Verify variable size segment is big enough for the current si. */ - if (mem_reqs[i].calc_sz != NULL) { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < mem_reqs[i].calc_sz(&cfg, init_flags)) { - return VPX_CODEC_MEM_ERROR; - } - } + info->has_detail = 1; + va_start(ap, fmt); + vsnprintf(info->detail, sz - 1, fmt, ap); + va_end(ap); + info->detail[sz - 1] = '\0'; } - return VPX_CODEC_OK; + + if (info->setjmp) + longjmp(info->jmp, info->error_code); } diff --git a/media/libvpx/vpx/src/vpx_decoder.c b/media/libvpx/vpx/src/vpx_decoder.c index 1f575e0a0423..802d8edd8a43 100644 --- a/media/libvpx/vpx/src/vpx_decoder.c +++ b/media/libvpx/vpx/src/vpx_decoder.c @@ -18,9 +18,13 @@ #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { + return (vpx_codec_alg_priv_t *)ctx->priv; +} + vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_dec_cfg_t *cfg, + const vpx_codec_dec_cfg_t *cfg, vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; @@ -31,8 +35,6 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_INVALID_PARAM; else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC)) res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) && @@ -50,18 +52,11 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, ctx->priv = NULL; ctx->init_flags = flags; ctx->config.dec = cfg; - res = VPX_CODEC_OK; - if (!(flags & VPX_CODEC_USE_XMA)) { - res = ctx->iface->init(ctx, NULL); - - if (res) { - ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; - vpx_codec_destroy(ctx); - } - - if (ctx->priv) - ctx->priv->iface = ctx->iface; + res = ctx->iface->init(ctx, NULL); + if (res) { + ctx->err_detail = ctx->priv ? 
ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); } } @@ -103,7 +98,7 @@ vpx_codec_err_t vpx_codec_get_stream_info(vpx_codec_ctx_t *ctx, si->w = 0; si->h = 0; - res = ctx->iface->dec.get_si(ctx->priv->alg_priv, si); + res = ctx->iface->dec.get_si(get_alg_priv(ctx), si); } return SAVE_STATUS(ctx, res); @@ -119,13 +114,13 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ - if (!ctx || (!data && data_sz)) + if (!ctx || (!data && data_sz) || (data && !data_sz)) res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; else { - res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz, - user_priv, deadline); + res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv, + deadline); } return SAVE_STATUS(ctx, res); @@ -138,7 +133,7 @@ vpx_image_t *vpx_codec_get_frame(vpx_codec_ctx_t *ctx, if (!ctx || !iter || !ctx->iface || !ctx->priv) img = NULL; else - img = ctx->iface->dec.get_frame(ctx->priv->alg_priv, iter); + img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter); return img; } @@ -172,7 +167,7 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx if (!ctx || !cb) res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv - || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME)) + || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE)) res = VPX_CODEC_ERROR; else { ctx->priv->dec.put_slice_cb.u.put_slice = cb; @@ -183,45 +178,19 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx return SAVE_STATUS(ctx, res); } +vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { + vpx_codec_err_t res; -vpx_codec_err_t vpx_codec_get_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) { - vpx_codec_err_t res = VPX_CODEC_OK; - - if (!ctx || !mmap || !iter || !ctx->iface) + if (!ctx || !cb_get || !cb_release) { res = VPX_CODEC_INVALID_PARAM; - else if (!(ctx->iface->caps & VPX_CODEC_CAP_XMA)) + } else if (!ctx->iface || !ctx->priv || + !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) { res = VPX_CODEC_ERROR; - else - res = ctx->iface->get_mmap(ctx, mmap, iter); - - return SAVE_STATUS(ctx, res); -} - - -vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - unsigned int num_maps) { - vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; - - if (!ctx || !mmap || !ctx->iface) - res = VPX_CODEC_INVALID_PARAM; - else if (!(ctx->iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_ERROR; - else { - unsigned int i; - - for (i = 0; i < num_maps; i++, mmap++) { - if (!mmap->base) - break; - - /* Everything look ok, set the mmap in the decoder */ - res = ctx->iface->set_mmap(ctx, mmap); - - if (res) - break; - } + } else { + res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release, + cb_priv); } return SAVE_STATUS(ctx, res); diff --git a/media/libvpx/vpx/src/vpx_encoder.c b/media/libvpx/vpx/src/vpx_encoder.c index e0a29cb94562..1903b5575c4b 100644 --- a/media/libvpx/vpx/src/vpx_encoder.c +++ b/media/libvpx/vpx/src/vpx_encoder.c @@ -20,9 +20,13 @@ #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var) +static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { + return (vpx_codec_alg_priv_t *)ctx->priv; +} + vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, + const vpx_codec_enc_cfg_t 
*cfg, vpx_codec_flags_t flags, int ver) { vpx_codec_err_t res; @@ -35,8 +39,6 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; @@ -55,9 +57,6 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; vpx_codec_destroy(ctx); } - - if (ctx->priv) - ctx->priv->iface = ctx->iface; } return SAVE_STATUS(ctx, res); @@ -70,7 +69,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, vpx_codec_flags_t flags, vpx_rational_t *dsf, int ver) { - vpx_codec_err_t res = 0; + vpx_codec_err_t res = VPX_CODEC_OK; if (ver != VPX_ENCODER_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; @@ -80,8 +79,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; @@ -139,9 +136,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, } } - if (ctx->priv) - ctx->priv->iface = ctx->iface; - if (res) break; @@ -149,6 +143,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, cfg++; dsf++; } + ctx--; } } @@ -161,6 +156,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, unsigned int usage) { vpx_codec_err_t res; vpx_codec_enc_cfg_map_t *map; + int i; if (!iface || !cfg || usage > INT_MAX) res = VPX_CODEC_INVALID_PARAM; @@ -169,7 +165,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, else { res = VPX_CODEC_INVALID_PARAM; - for (map = iface->enc.cfg_maps; map->usage >= 0; map++) { + for (i = 0; i < iface->enc.cfg_map_count; ++i) { + map = iface->enc.cfg_maps + i; if (map->usage == (int)usage) { *cfg = map->cfg; cfg->g_usage = usage; @@ -206,7 +203,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, unsigned long duration, vpx_enc_frame_flags_t flags, unsigned long deadline) { - vpx_codec_err_t res = 0; + vpx_codec_err_t res = VPX_CODEC_OK; if (!ctx || (img && !duration)) res = VPX_CODEC_INVALID_PARAM; @@ -223,7 +220,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, FLOATING_POINT_INIT(); if (num_enc == 1) - res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags, deadline); else { /* Multi-resolution encoding: @@ -237,7 +234,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, if (img) img += num_enc - 1; for (i = num_enc - 1; i >= 0; i--) { - if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts, + if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags, deadline))) break; @@ -254,8 +251,8 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, } -const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, - vpx_codec_iter_t *iter) { +const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t *ctx, + vpx_codec_iter_t *iter) { const vpx_codec_cx_pkt_t *pkt = NULL; if (ctx) { @@ -266,36 +263,34 @@ const vpx_codec_cx_pkt_t *vpx_codec_get_cx_data(vpx_codec_ctx_t 
*ctx, else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) ctx->err = VPX_CODEC_INCAPABLE; else - pkt = ctx->iface->enc.get_cx_data(ctx->priv->alg_priv, iter); + pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); } if (pkt && pkt->kind == VPX_CODEC_CX_FRAME_PKT) { - /* If the application has specified a destination area for the - * compressed data, and the codec has not placed the data there, - * and it fits, copy it. - */ - char *dst_buf = ctx->priv->enc.cx_data_dst_buf.buf; + // If the application has specified a destination area for the + // compressed data, and the codec has not placed the data there, + // and it fits, copy it. + vpx_codec_priv_t *const priv = ctx->priv; + char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; - if (dst_buf - && pkt->data.raw.buf != dst_buf - && pkt->data.raw.sz - + ctx->priv->enc.cx_data_pad_before - + ctx->priv->enc.cx_data_pad_after - <= ctx->priv->enc.cx_data_dst_buf.sz) { - vpx_codec_cx_pkt_t *modified_pkt = &ctx->priv->enc.cx_data_pkt; + if (dst_buf && + pkt->data.raw.buf != dst_buf && + pkt->data.raw.sz + priv->enc.cx_data_pad_before + + priv->enc.cx_data_pad_after <= priv->enc.cx_data_dst_buf.sz) { + vpx_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; - memcpy(dst_buf + ctx->priv->enc.cx_data_pad_before, - pkt->data.raw.buf, pkt->data.raw.sz); + memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, + pkt->data.raw.sz); *modified_pkt = *pkt; modified_pkt->data.raw.buf = dst_buf; - modified_pkt->data.raw.sz += ctx->priv->enc.cx_data_pad_before - + ctx->priv->enc.cx_data_pad_after; + modified_pkt->data.raw.sz += priv->enc.cx_data_pad_before + + priv->enc.cx_data_pad_after; pkt = modified_pkt; } if (dst_buf == pkt->data.raw.buf) { - ctx->priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; - ctx->priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; + priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; + priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; } } @@ -336,7 +331,7 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx) { else if (!ctx->iface->enc.get_preview) ctx->err = VPX_CODEC_INCAPABLE; else - img = ctx->iface->enc.get_preview(ctx->priv->alg_priv); + img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); } return img; @@ -354,7 +349,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx) { else if (!ctx->iface->enc.get_glob_hdrs) ctx->err = VPX_CODEC_INCAPABLE; else - buf = ctx->iface->enc.get_glob_hdrs(ctx->priv->alg_priv); + buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); } return buf; @@ -370,7 +365,7 @@ vpx_codec_err_t vpx_codec_enc_config_set(vpx_codec_ctx_t *ctx, else if (!(ctx->iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; else - res = ctx->iface->enc.cfg_set(ctx->priv->alg_priv, cfg); + res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); return SAVE_STATUS(ctx, res); } @@ -395,7 +390,7 @@ const vpx_codec_cx_pkt_t *vpx_codec_pkt_list_get(struct vpx_codec_pkt_list *list *iter = list->pkts; } - pkt = (const void *) * iter; + pkt = (const vpx_codec_cx_pkt_t *)*iter; if ((size_t)(pkt - list->pkts) < list->cnt) *iter = pkt + 1; diff --git a/media/libvpx/vpx/src/vpx_image.c b/media/libvpx/vpx/src/vpx_image.c index 36eda958ede7..e58b61ea390d 100644 --- a/media/libvpx/vpx/src/vpx_image.c +++ b/media/libvpx/vpx/src/vpx_image.c @@ -8,45 +8,20 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include <stdlib.h> #include <string.h> + #include "vpx/vpx_image.h" +#include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" -#define ADDRESS_STORAGE_SIZE sizeof(size_t) -/*returns an addr aligned to the byte boundary specified by align*/ -#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) - -/* Memalign code is copied from vpx_mem.c */ -static void *img_buf_memalign(size_t align, size_t size) { - void *addr, - * x = NULL; - - addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE); - - if (addr) { - x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align); - /* save the actual malloc address */ - ((size_t *)x)[-1] = (size_t)addr; - } - - return x; -} - -static void img_buf_free(void *memblk) { - if (memblk) { - void *addr = (void *)(((size_t *)memblk)[-1]); - free(addr); - } -} - -static vpx_image_t *img_alloc_helper(vpx_image_t *img, - vpx_img_fmt_t fmt, - unsigned int d_w, - unsigned int d_h, - unsigned int buf_align, - unsigned int stride_align, - unsigned char *img_data) { +static vpx_image_t *img_alloc_helper(vpx_image_t *img, + vpx_img_fmt_t fmt, + unsigned int d_w, + unsigned int d_h, + unsigned int buf_align, + unsigned int stride_align, + unsigned char *img_data) { unsigned int h, w, s, xcs, ycs, bps; int align; @@ -94,6 +69,21 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, case VPX_IMG_FMT_VPXYV12: bps = 12; break; + case VPX_IMG_FMT_I422: + bps = 16; + break; + case VPX_IMG_FMT_I444: + bps = 24; + break; + case VPX_IMG_FMT_I42016: + bps = 24; + break; + case VPX_IMG_FMT_I42216: + bps = 32; + break; + case VPX_IMG_FMT_I44416: + bps = 48; + break; default: bps = 16; break; @@ -105,6 +95,9 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, case VPX_IMG_FMT_YV12: case VPX_IMG_FMT_VPXI420: case VPX_IMG_FMT_VPXYV12: + case VPX_IMG_FMT_I422: + case VPX_IMG_FMT_I42016: + case VPX_IMG_FMT_I42216: xcs = 1; break; default: @@ -147,8 +140,13 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, img->img_data = img_data; if (!img_data) { - img->img_data = img_buf_memalign(buf_align, ((fmt & VPX_IMG_FMT_PLANAR) ? - h * s * bps / 8 : h * s)); + const uint64_t alloc_size = (fmt & VPX_IMG_FMT_PLANAR) ? + (uint64_t)h * s * bps / 8 : (uint64_t)h * s; + + if (alloc_size != (size_t)alloc_size) + goto fail; + + img->img_data = (uint8_t *)vpx_memalign(buf_align, (size_t)alloc_size); img->img_data_owner = 1; } @@ -156,6 +154,7 @@ static vpx_image_t *img_alloc_helper(vpx_image_t *img, goto fail; img->fmt = fmt; + img->bit_depth = (fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; img->w = w; img->h = h; img->x_chroma_shift = xcs; @@ -271,7 +270,7 @@ void vpx_img_flip(vpx_image_t *img) { void vpx_img_free(vpx_image_t *img) { if (img) { if (img->img_data && img->img_data_owner) - img_buf_free(img->img_data); + vpx_free(img->img_data); if (img->self_allocd) free(img); diff --git a/media/libvpx/vpx/src/vpx_psnr.c b/media/libvpx/vpx/src/vpx_psnr.c new file mode 100644 index 000000000000..05843acb61f0 --- /dev/null +++ b/media/libvpx/vpx/src/vpx_psnr.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <math.h> + +#include "vpx/internal/vpx_psnr.h" + +#define MAX_PSNR 100.0 + +double vpx_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} diff --git a/media/libvpx/vpx/svc_context.h b/media/libvpx/vpx/svc_context.h index f4933f8d6ef5..eea3b131aae4 100644 --- a/media/libvpx/vpx/svc_context.h +++ b/media/libvpx/vpx/svc_context.h @@ -16,20 +16,13 @@ #ifndef VPX_SVC_CONTEXT_H_ #define VPX_SVC_CONTEXT_H_ -#include "vpx/vp8cx.h" -#include "vpx/vpx_encoder.h" +#include "./vp8cx.h" +#include "./vpx_encoder.h" #ifdef __cplusplus extern "C" { #endif -typedef enum SVC_ENCODING_MODE { - INTER_LAYER_PREDICTION_I, - ALT_INTER_LAYER_PREDICTION_IP, - INTER_LAYER_PREDICTION_IP, - USE_GOLDEN_FRAME -} SVC_ENCODING_MODE; - typedef enum SVC_LOG_LEVEL { SVC_LOG_ERROR, SVC_LOG_INFO, @@ -38,9 +31,8 @@ typedef enum SVC_LOG_LEVEL { typedef struct { // public interface to svc_command options - int spatial_layers; // number of layers - int first_frame_full_size; // set to one to force first frame full size - SVC_ENCODING_MODE encoding_mode; // svc encoding strategy + int spatial_layers; // number of spatial layers + int temporal_layers; // number of temporal layers SVC_LOG_LEVEL log_level; // amount of information to display int log_print; // when set, printf log messages instead of returning the // message with svc_get_message @@ -104,14 +96,27 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx); const char *vpx_svc_get_message(const SvcContext *svc_ctx); /** - * return size of encoded data to be returned by vpx_svc_get_buffer + * return size of encoded data to be returned by vpx_svc_get_buffer. + * it needs to be called before vpx_svc_get_buffer. */ size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx); /** - * return buffer with encoded data + * return buffer with encoded data. encoder will maintain a list of frame + * buffers. each call of vpx_svc_get_buffer() will return one frame. */ -void *vpx_svc_get_buffer(const SvcContext *svc_ctx); +void *vpx_svc_get_buffer(SvcContext *svc_ctx); + +/** + * return size of two pass rate control stats data to be returned by + * vpx_svc_get_rc_stats_buffer + */ +size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx); + +/** + * return buffer of two pass rate control stats data + */ +char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx); /** * return spatial resolution of the specified layer @@ -139,4 +144,4 @@ void vpx_svc_set_keyframe(SvcContext *svc_ctx); } // extern "C" #endif -#endif /* VPX_SVC_CONTEXT_H_ */ +#endif // VPX_SVC_CONTEXT_H_ diff --git a/media/libvpx/vpx/vp8.h b/media/libvpx/vpx/vp8.h index 056fa7aed72e..2a31af6d10af 100644 --- a/media/libvpx/vpx/vp8.h +++ b/media/libvpx/vpx/vp8.h @@ -27,8 +27,8 @@ /*!\file * \brief Provides controls common to both the VP8 encoder and decoder. */ -#ifndef VP8_H -#define VP8_H +#ifndef VPX_VP8_H_ +#define VPX_VP8_H_ #include "./vpx_codec.h" #include "./vpx_image.h" @@ -135,4 +135,4 @@ VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) } // extern "C" #endif -#endif +#endif // VPX_VP8_H_ diff --git a/media/libvpx/vpx/vp8cx.h b/media/libvpx/vpx/vp8cx.h index c0424f1466a4..796a7a1c209d 100644 --- a/media/libvpx/vpx/vp8cx.h +++ b/media/libvpx/vpx/vp8cx.h @@ -7,15 +7,15 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VP8CX_H -#define VP8CX_H +#ifndef VPX_VP8CX_H_ +#define VPX_VP8CX_H_ /*!\defgroup vp8_encoder WebM VP8 Encoder * \ingroup vp8 * * @{ */ -#include "vp8.h" +#include "./vp8.h" /*!\file * \brief Provides definitions for using the VP8 encoder algorithm within the @@ -38,8 +38,6 @@ extern vpx_codec_iface_t *vpx_codec_vp8_cx(void); /* TODO(jkoleszar): These move to VP9 in a later patch set. */ extern vpx_codec_iface_t vpx_codec_vp9_cx_algo; extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); -extern vpx_codec_iface_t vpx_codec_vp9x_cx_algo; -extern vpx_codec_iface_t *vpx_codec_vp9x_cx(void); /*!@} - end algorithm interface member group*/ @@ -162,8 +160,12 @@ enum vp8e_enc_control_id { scale as used by the rc_*_quantizer config parameters */ VP8E_SET_ARNR_MAXFRAMES, /**< control function to set the max number of frames blurred creating arf*/ - VP8E_SET_ARNR_STRENGTH, /**< control function to set the filter strength for the arf */ - VP8E_SET_ARNR_TYPE, /**< control function to set the type of filter to use for the arf*/ + VP8E_SET_ARNR_STRENGTH, //!< control function to set the filter + //!< strength for the arf + + /*!\deprecated control function to set the filter type to use for the arf */ + VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, /**< control function to set visual tuning */ /*!\brief control function to set constrained quality level * @@ -194,9 +196,17 @@ enum vp8e_enc_control_id { VP9E_SET_TILE_ROWS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, + VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_SVC, - VP9E_SET_SVC_PARAMETERS + VP9E_SET_SVC_PARAMETERS, + /*!\brief control function to set svc layer for spatial and temporal. + * \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial + * layer and 0..#vpx_codec_enc_cfg::ts_number_layers for + * temporal layer. + */ + VP9E_SET_SVC_LAYER_ID, + VP9E_SET_TUNE_CONTENT }; /*!\brief vpx 1-D scaling mode @@ -257,7 +267,7 @@ typedef struct vpx_scaling_mode { /*!\brief VP8 token partition mode * * This defines VP8 partitioning mode for compressed data, i.e., the number of - * sub-streams in the bitstream. Used for parallelized decoding. + * sub-streams in the bitstream. Used for parallelized decoding. * */ @@ -268,6 +278,12 @@ typedef enum { VP8_EIGHT_TOKENPARTITION = 3 } vp8e_token_partitions; +/*!brief VP9 encoder content type */ +typedef enum { + VP9E_CONTENT_DEFAULT, + VP9E_CONTENT_SCREEN, + VP9E_CONTENT_INVALID +} vp9e_tune_content; /*!\brief VP8 model tuning parameters * @@ -287,8 +303,8 @@ typedef enum { typedef struct vpx_svc_parameters { unsigned int width; /**< width of current spatial layer */ unsigned int height; /**< height of current spatial layer */ - int layer; /**< current layer number - 0 = base */ - int flags; /**< encode frame flags */ + int spatial_layer; /**< current spatial layer number - 0 = base */ + int temporal_layer; /**< current temporal layer number - 0 = base */ int max_quantizer; /**< max quantizer for current layer */ int min_quantizer; /**< min quantizer for current layer */ int distance_from_i_frame; /**< frame number within current gop */ @@ -297,6 +313,18 @@ typedef struct vpx_svc_parameters { int alt_fb_idx; /**< alt reference frame frame buffer index */ } vpx_svc_parameters_t; +/*!\brief vp9 svc layer parameters + * + * This defines the spatial and temporal layer id numbers for svc encoding. + * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the spatial and + * temporal layer id for the current frame. 
+ * + */ +typedef struct vpx_svc_layer_id { + int spatial_layer_id; /**< Spatial layer id number. */ + int temporal_layer_id; /**< Temporal layer id number. */ +} vpx_svc_layer_id_t; + /*!\brief VP8 encoder control function parameter type * * Defines the data types that VP8E control functions take. Note that @@ -318,6 +346,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int) VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, vpx_svc_parameters_t *) +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *) VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int) VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int) @@ -328,7 +357,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) -VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE, unsigned int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) @@ -346,9 +375,12 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) + +VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" #endif -#endif +#endif // VPX_VP8CX_H_ diff --git a/media/libvpx/vpx/vp8dx.h b/media/libvpx/vpx/vp8dx.h index d3093c48c5db..379b3062089c 100644 --- a/media/libvpx/vpx/vp8dx.h +++ b/media/libvpx/vpx/vp8dx.h @@ -9,8 +9,6 @@ */ -#include "vp8.h" - /*!\defgroup vp8_decoder WebM VP8 Decoder * \ingroup vp8 * @@ -20,13 +18,16 @@ * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder * interface. */ -#ifndef VP8DX_H -#define VP8DX_H +#ifndef VPX_VP8DX_H_ +#define VPX_VP8DX_H_ #ifdef __cplusplus extern "C" { #endif +/* Include controls common to both the encoder and decoder */ +#include "./vp8.h" + /*!\name Algorithm interface for VP8 * * This interface provides the capability to decode raw VP8 streams, as would @@ -41,9 +42,6 @@ extern vpx_codec_iface_t vpx_codec_vp9_dx_algo; extern vpx_codec_iface_t *vpx_codec_vp9_dx(void); /*!@} - end algorithm interface member group*/ -/* Include controls common to both the encoder and decoder */ -#include "vp8.h" - /*!\enum vp8_dec_control_id * \brief VP8 decoder control functions @@ -68,10 +66,17 @@ enum vp8_dec_control_id { VP8D_GET_LAST_REF_USED, /** decryption function to decrypt encoded buffer data immediately - * before decoding. Takes a vp8_decrypt_init, which contains + * before decoding. Takes a vpx_decrypt_init, which contains * a callback function and opaque context pointer. */ - VP8D_SET_DECRYPTOR, + VPXD_SET_DECRYPTOR, + VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR, + + /** control function to get the display dimensions for the current frame. */ + VP9D_GET_DISPLAY_SIZE, + + /** control function to get the bit depth of the stream. */ + VP9D_GET_BIT_DEPTH, /** For testing. */ VP9_INVERT_TILE_DECODE_ORDER, @@ -79,19 +84,28 @@ enum vp8_dec_control_id { VP8_DECODER_CTRL_ID_MAX }; +/** Decrypt n bytes of data from input -> output, using the decrypt_state + * passed in VPXD_SET_DECRYPTOR. 
+ */ +typedef void (*vpx_decrypt_cb)(void *decrypt_state, const unsigned char *input, + unsigned char *output, int count); + /*!\brief Structure to hold decryption state * * Defines a structure to hold the decryption state and access function. */ -typedef struct vp8_decrypt_init { - /** Decrypt n bytes of data from input -> output, using the decrypt_state - * passed in VP8D_SET_DECRYPTOR. - */ - void (*decrypt_cb)(void *decrypt_state, const unsigned char *input, - unsigned char *output, int count); +typedef struct vpx_decrypt_init { + /*! Decrypt callback. */ + vpx_decrypt_cb decrypt_cb; + /*! Decryption state. */ void *decrypt_state; -} vp8_decrypt_init; +} vpx_decrypt_init; + +/*!\brief A deprecated alias for vpx_decrypt_init. + */ +typedef vpx_decrypt_init vp8_decrypt_init; + /*!\brief VP8 decoder control function parameter type * @@ -101,10 +115,13 @@ typedef struct vp8_decrypt_init { */ -VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) -VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) -VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) -VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vp8_decrypt_init *) +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *) +VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *) +VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *) +VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *) +VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *) +VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *) VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) /*! @} - end defgroup vp8_decoder */ @@ -113,4 +130,4 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) } // extern "C" #endif -#endif +#endif // VPX_VP8DX_H_ diff --git a/media/libvpx/vpx/vpx_codec.h b/media/libvpx/vpx/vpx_codec.h index 3ea36d6ef168..b25308ed9a6c 100644 --- a/media/libvpx/vpx/vpx_codec.h +++ b/media/libvpx/vpx/vpx_codec.h @@ -36,15 +36,15 @@ * Once initialized, the instance is managed using other functions from * the vpx_codec_* family. */ -#ifndef VPX_CODEC_H -#define VPX_CODEC_H +#ifndef VPX_VPX_CODEC_H_ +#define VPX_VPX_CODEC_H_ #ifdef __cplusplus extern "C" { #endif -#include "vpx_integer.h" -#include "vpx_image.h" +#include "./vpx_integer.h" +#include "./vpx_image.h" /*!\brief Decorator indicating a function is deprecated */ #ifndef DEPRECATED @@ -153,7 +153,6 @@ extern "C" { typedef long vpx_codec_caps_t; #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ -#define VPX_CODEC_CAP_XMA 0x4 /**< Supports eXternal Memory Allocation */ /*! \brief Initialization-time Feature Enabling * * The available flags are specified by VPX_CODEC_USE_* defines. */ typedef long vpx_codec_flags_t; -#define VPX_CODEC_USE_XMA 0x00000001 /**< Use eXternal Memory Allocation mode */ /*!\brief Codec interface structure.
@@ -205,13 +203,24 @@ extern "C" { const char *err_detail; /**< Detailed info, if available */ vpx_codec_flags_t init_flags; /**< Flags passed at init time */ union { - struct vpx_codec_dec_cfg *dec; /**< Decoder Configuration Pointer */ - struct vpx_codec_enc_cfg *enc; /**< Encoder Configuration Pointer */ - void *raw; + /**< Decoder Configuration Pointer */ + const struct vpx_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct vpx_codec_enc_cfg *enc; + const void *raw; } config; /**< Configuration pointer aliasing union */ vpx_codec_priv_t *priv; /**< Algorithm private storage */ } vpx_codec_ctx_t; + /*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ + typedef enum vpx_bit_depth { + VPX_BITS_8 = 8, /**< 8 bits */ + VPX_BITS_10 = 10, /**< 10 bits */ + VPX_BITS_12 = 12, /**< 12 bits */ + } vpx_bit_depth_t; /* * Library Version Number Interface @@ -462,97 +471,9 @@ extern "C" { #endif - - /*!\defgroup cap_xma External Memory Allocation Functions - * - * The following functions are required to be implemented for all codecs - * that advertise the VPX_CODEC_CAP_XMA capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_INCAPABLE - * @{ - */ - - - /*!\brief Memory Map Entry - * - * This structure is used to contain the properties of a memory segment. It - * is populated by the codec in the request phase, and by the calling - * application once the requested allocation has been performed. - */ - typedef struct vpx_codec_mmap { - /* - * The following members are set by the codec when requesting a segment - */ - unsigned int id; /**< identifier for the segment's contents */ - unsigned long sz; /**< size of the segment, in bytes */ - unsigned int align; /**< required alignment of the segment, in bytes */ - unsigned int flags; /**< bitfield containing segment properties */ -#define VPX_CODEC_MEM_ZERO 0x1 /**< Segment must be zeroed by allocation */ -#define VPX_CODEC_MEM_WRONLY 0x2 /**< Segment need not be readable */ -#define VPX_CODEC_MEM_FAST 0x4 /**< Place in fast memory, if available */ - - /* The following members are to be filled in by the allocation function */ - void *base; /**< pointer to the allocated segment */ - void (*dtor)(struct vpx_codec_mmap *map); /**< destructor to call */ - void *priv; /**< allocator private storage */ - } vpx_codec_mmap_t; /**< alias for struct vpx_codec_mmap */ - - - /*!\brief Iterate over the list of segments to allocate. - * - * Iterates over a list of the segments to allocate. The iterator storage - * should be initialized to NULL to start the iteration. Iteration is complete - * when this function returns VPX_CODEC_LIST_END. The amount of memory needed to - * allocate is dependent upon the size of the encoded stream. In cases where the - * stream is not available at allocation time, a fixed size must be requested. - * The codec will not be able to operate on streams larger than the size used at - * allocation time. - * - * \param[in] ctx Pointer to this instance's context. - * \param[out] mmap Pointer to the memory map entry to populate. - * \param[in,out] iter Iterator storage, initialized to NULL - * - * \retval #VPX_CODEC_OK - * The memory map entry was populated. - * \retval #VPX_CODEC_ERROR - * Codec does not support XMA mode. - * \retval #VPX_CODEC_MEM_ERROR - * Unable to determine segment size from stream info. 
- */ - vpx_codec_err_t vpx_codec_get_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter); - - - /*!\brief Identify allocated segments to codec instance - * - * Stores a list of allocated segments in the codec. Segments \ref MUST be - * passed in the order they are read from vpx_codec_get_mem_map(), but may be - * passed in groups of any size. Segments \ref MUST be set only once. The - * allocation function \ref MUST ensure that the vpx_codec_mmap_t::base member - * is non-NULL. If the segment requires cleanup handling (e.g., calling free() - * or close()) then the vpx_codec_mmap_t::dtor member \ref MUST be populated. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] mmaps Pointer to the first memory map entry in the list. - * \param[in] num_maps Number of entries being set at this time - * - * \retval #VPX_CODEC_OK - * The segment was stored in the codec context. - * \retval #VPX_CODEC_INCAPABLE - * Codec does not support XMA mode. - * \retval #VPX_CODEC_MEM_ERROR - * Segment base address was not set, or segment was already stored. - - */ - vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmaps, - unsigned int num_maps); - - /*!@} - end defgroup cap_xma*/ /*!@} - end defgroup codec*/ #ifdef __cplusplus } #endif -#endif +#endif // VPX_VPX_CODEC_H_ diff --git a/media/libvpx/vpx/vpx_decoder.h b/media/libvpx/vpx/vpx_decoder.h index 2dcd024cccaa..62fd91975642 100644 --- a/media/libvpx/vpx/vpx_decoder.h +++ b/media/libvpx/vpx/vpx_decoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DECODER_H -#define VPX_DECODER_H +#ifndef VPX_VPX_DECODER_H_ +#define VPX_VPX_DECODER_H_ /*!\defgroup decoder Decoder Algorithm Interface * \ingroup codec @@ -29,7 +29,8 @@ extern "C" { #endif -#include "vpx_codec.h" +#include "./vpx_codec.h" +#include "./vpx_frame_buffer.h" /*!\brief Current ABI version number * @@ -39,7 +40,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_DECODER_ABI_VERSION (2 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Decoder capabilities bitfield * @@ -66,6 +67,8 @@ extern "C" { */ #define VPX_CODEC_CAP_FRAME_THREADING 0x200000 /**< Can support frame-based multi-threading */ +#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external + frame buffers */ #define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */ #define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded @@ -119,10 +122,6 @@ extern "C" { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. 
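For orientation between these vpx_decoder.h hunks, the decode-side API the patch touches is used roughly as follows. This is an illustrative sketch, not part of the patch: decode_one is a hypothetical wrapper, while vpx_codec_dec_init() (the version-checked macro over vpx_codec_dec_init_ver()), vpx_codec_decode(), and vpx_codec_get_frame() are the real entry points.

    #include "vpx/vp8dx.h"
    #include "vpx/vpx_decoder.h"

    /* Decode one compressed frame and walk the output images.
     * cfg may be NULL and is now passed as const; the updated sanity
     * check in vpx_codec_decode() rejects a NULL buffer paired with a
     * nonzero size (and vice versa) with VPX_CODEC_INVALID_PARAM. */
    static int decode_one(const uint8_t *data, unsigned int data_sz) {
      vpx_codec_ctx_t ctx;
      vpx_codec_iter_t iter = NULL;
      vpx_image_t *img;

      if (vpx_codec_dec_init(&ctx, vpx_codec_vp9_dx(), NULL, 0))
        return -1;
      if (vpx_codec_decode(&ctx, data, data_sz, NULL, 0)) {
        vpx_codec_destroy(&ctx);
        return -1;
      }
      while ((img = vpx_codec_get_frame(&ctx, &iter)) != NULL) {
        /* consume img->planes[] / img->stride[] here */
      }
      return vpx_codec_destroy(&ctx) == VPX_CODEC_OK ? 0 : -1;
    }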
@@ -136,7 +135,7 @@ extern "C" { */ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_dec_cfg_t *cfg, + const vpx_codec_dec_cfg_t *cfg, vpx_codec_flags_t flags, int ver); @@ -326,9 +325,54 @@ extern "C" { /*!@} - end defgroup cap_put_slice*/ + /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following section is required to be implemented for all decoders + * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually VPX_CODEC_ERROR. + * + * \note + * Currently this only works with VP9. + * @{ + */ + + /*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libvpx needs a frame buffer + * to decode the current frame and a function to be called when libvpx does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libvpx will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #VPX_CODEC_OK + * External frame buffers will be used by libvpx. + * \retval #VPX_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #VPX_CODEC_ERROR + * Decoder context not initialized, or algorithm not capable of + * using external frame buffers. + * + * \note + * When decoding VP9, the application may be required to pass in at least + * #VP9_MAXIMUM_REF_BUFFERS + #VPX_MAXIMUM_WORK_BUFFERS external frame + * buffers. + */ + vpx_codec_err_t vpx_codec_set_frame_buffer_functions( + vpx_codec_ctx_t *ctx, + vpx_get_frame_buffer_cb_fn_t cb_get, + vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + + /*!@} - end defgroup cap_external_frame_buffer */ + /*!@} - end defgroup decoder*/ #ifdef __cplusplus } #endif -#endif +#endif // VPX_VPX_DECODER_H_ diff --git a/media/libvpx/vpx/vpx_encoder.h b/media/libvpx/vpx/vpx_encoder.h index 56752cfcfb5a..fdabed121705 100644 --- a/media/libvpx/vpx/vpx_encoder.h +++ b/media/libvpx/vpx/vpx_encoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_ENCODER_H -#define VPX_ENCODER_H +#ifndef VPX_VPX_ENCODER_H_ +#define VPX_VPX_ENCODER_H_ /*!\defgroup encoder Encoder Algorithm Interface * \ingroup codec @@ -29,7 +29,7 @@ extern "C" { #endif -#include "vpx_codec.h" +#include "./vpx_codec.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -49,7 +49,7 @@ extern "C" { #define VPX_SS_MAX_LAYERS 5 /*! Spatial Scalability: Default number of coding layers */ -#define VPX_SS_DEFAULT_LAYERS 3 +#define VPX_SS_DEFAULT_LAYERS 1 /*!\brief Current ABI version number * @@ -80,6 +80,9 @@ extern "C" { */ #define VPX_CODEC_CAP_OUTPUT_PARTITION 0x20000 +/*! Can support input images at greater than 8 bitdepth. + */ +#define VPX_CODEC_CAP_HIGHBITDEPTH 0x40000 /*! \brief Initialization-time Feature Enabling * @@ -91,6 +94,7 @@ extern "C" { #define VPX_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ #define VPX_CODEC_USE_OUTPUT_PARTITION 0x20000 /**< Make the encoder output one partition at a time. 
*/ +#define VPX_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ /*!\brief Generic fixed size buffer structure @@ -155,7 +159,11 @@ extern "C" { enum vpx_codec_cx_pkt_kind { VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ +#if CONFIG_SPATIAL_SVC + VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ +#endif VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ }; @@ -184,13 +192,17 @@ extern "C" { has id 0.*/ } frame; /**< data for compressed frame packet */ - struct vpx_fixed_buf twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ struct vpx_psnr_pkt { unsigned int samples[4]; /**< Number of samples, total/y/u/v */ uint64_t sse[4]; /**< sum squared error, total/y/u/v */ double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ - struct vpx_fixed_buf raw; /**< data for arbitrary packets */ + vpx_fixed_buf_t raw; /**< data for arbitrary packets */ +#if CONFIG_SPATIAL_SVC + size_t layer_sizes[VPX_SS_MAX_LAYERS]; +#endif /* This packet size is fixed to allow codecs to extend this * interface without having to manage storage for raw packets, @@ -316,6 +328,21 @@ extern "C" { */ unsigned int g_h; + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * vpx_bit_depth_t enum. + */ + vpx_bit_depth_t g_bit_depth; + + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; /*!\brief Stream timebase units * @@ -396,6 +423,19 @@ extern "C" { */ unsigned int rc_resize_allowed; + /*!\brief Internal coded frame width. + * + * If spatial resampling is enabled this specifies the width of the + * encoded frame. + */ + unsigned int rc_scaled_width; + + /*!\brief Internal coded frame height. + * + * If spatial resampling is enabled this specifies the height of the + * encoded frame. + */ + unsigned int rc_scaled_height; /*!\brief Spatial resampling up watermark. * @@ -431,8 +471,14 @@ extern "C" { * A buffer containing all of the stats packets produced in the first * pass, concatenated. */ - struct vpx_fixed_buf rc_twopass_stats_in; + vpx_fixed_buf_t rc_twopass_stats_in; + /*!\brief first pass mb stats buffer. + * + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. + */ + vpx_fixed_buf_t rc_firstpass_mb_stats_in; /*!\brief Target data rate * @@ -604,47 +650,62 @@ extern "C" { * Spatial scalability settings (ss) */ - /*!\brief Number of coding layers (spatial) + /*!\brief Number of spatial coding layers. * - * This value specifies the number of coding layers to be used. + * This value specifies the number of spatial coding layers to be used. */ unsigned int ss_number_layers; - /*!\brief Number of coding layers + /*!\brief Enable auto alt reference flags for each spatial layer. * - * This value specifies the number of coding layers to be used. + * These values specify if auto alt reference frame is enabled for each + * spatial layer. 
+ */ + int ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; + + /*!\brief Target bitrate for each spatial layer. + * + * These values specify the target coding bitrate to be used for each + * spatial layer. + */ + unsigned int ss_target_bitrate[VPX_SS_MAX_LAYERS]; + + /*!\brief Number of temporal coding layers. + * + * This value specifies the number of temporal layers to be used. */ unsigned int ts_number_layers; - /*!\brief Target bitrate for each layer + /*!\brief Target bitrate for each temporal layer. * - * These values specify the target coding bitrate for each coding layer. + * These values specify the target coding bitrate to be used for each + * temporal layer. */ unsigned int ts_target_bitrate[VPX_TS_MAX_LAYERS]; - /*!\brief Frame rate decimation factor for each layer + /*!\brief Frame rate decimation factor for each temporal layer. * * These values specify the frame rate decimation factors to apply - * to each layer. + * to each temporal layer. */ unsigned int ts_rate_decimator[VPX_TS_MAX_LAYERS]; - /*!\brief Length of the sequence defining frame layer membership + /*!\brief Length of the sequence defining frame temporal layer membership. * * This value specifies the length of the sequence that defines the - * membership of frames to layers. For example, if ts_periodicity=8 then - * frames are assigned to coding layers with a repeated sequence of - * length 8. - */ + * membership of frames to temporal layers. For example, if the + * ts_periodicity = 8, then the frames are assigned to coding layers with a + * repeated sequence of length 8. + */ unsigned int ts_periodicity; - /*!\brief Template defining the membership of frames to coding layers + /*!\brief Template defining the membership of frames to temporal layers. * - * This array defines the membership of frames to coding layers. For a - * 2-layer encoding that assigns even numbered frames to one layer (0) - * and odd numbered frames to a second layer (1) with ts_periodicity=8, - * then ts_layer_id = (0,1,0,1,0,1,0,1). - */ + * This array defines the membership of frames to temporal coding layers. + * For a 2-layer encoding that assigns even numbered frames to one temporal + * layer (0) and odd numbered frames to a second temporal layer (1) with + * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). + */ unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */ @@ -660,10 +721,6 @@ extern "C" { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. @@ -677,7 +734,7 @@ extern "C" { */ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, vpx_codec_iface_t *iface, - vpx_codec_enc_cfg_t *cfg, + const vpx_codec_enc_cfg_t *cfg, vpx_codec_flags_t flags, int ver); @@ -697,10 +754,6 @@ extern "C" { * instead of this function directly, to ensure that the ABI version number * parameter is properly initialized. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. 
- * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. @@ -932,5 +985,5 @@ extern "C" { #ifdef __cplusplus } #endif -#endif +#endif // VPX_VPX_ENCODER_H_ diff --git a/media/libvpx/vpx/vpx_frame_buffer.h b/media/libvpx/vpx/vpx_frame_buffer.h new file mode 100644 index 000000000000..e69df4bc8221 --- /dev/null +++ b/media/libvpx/vpx/vpx_frame_buffer.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_FRAME_BUFFER_H_ +#define VPX_VPX_FRAME_BUFFER_H_ + +/*!\file + * \brief Describes the decoder external frame buffer interface. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./vpx_integer.h" + +/*!\brief The maximum number of work buffers used by libvpx. + */ +#define VPX_MAXIMUM_WORK_BUFFERS 1 + +/*!\brief The maximum number of reference buffers that a VP9 encoder may use. + */ +#define VP9_MAXIMUM_REF_BUFFERS 8 + +/*!\brief External frame buffer + * + * This structure holds allocated frame buffers used by the decoder. + */ +typedef struct vpx_codec_frame_buffer { + uint8_t *data; /**< Pointer to the data buffer */ + size_t size; /**< Size of data in bytes */ + void *priv; /**< Frame's private data */ +} vpx_codec_frame_buffer_t; + +/*!\brief get frame buffer callback prototype + * + * This callback is invoked by the decoder to retrieve data for the frame + * buffer in order for the decode call to complete. The callback must + * allocate at least min_size in bytes and assign it to fb->data. Then the + * callback must set fb->size to the allocated size. The application does not + * need to align the allocated data. The callback is triggered when the + * decoder needs a frame buffer to decode a compressed image into. This + * function may be called more than once for every call to vpx_codec_decode. + * The application may set fb->priv to some data which will be passed + * back in the ximage and the release function call. |fb| is guaranteed to + * not be NULL. On success the callback must return 0. Any failure the + * callback must return a value less than 0. + * + * \param[in] priv Callback's private data + * \param[in] new_size Size in bytes needed by the buffer + * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t + */ +typedef int (*vpx_get_frame_buffer_cb_fn_t)( + void *priv, size_t min_size, vpx_codec_frame_buffer_t *fb); + +/*!\brief release frame buffer callback prototype + * + * This callback is invoked by the decoder when the frame buffer is not + * referenced by any other buffers. |fb| is guaranteed to not be NULL. On + * success the callback must return 0. Any failure the callback must return + * a value less than 0. 
+ * + * \param[in] priv Callback's private data + * \param[in] fb Pointer to vpx_codec_frame_buffer_t + */ +typedef int (*vpx_release_frame_buffer_cb_fn_t)( + void *priv, vpx_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_FRAME_BUFFER_H_ diff --git a/media/libvpx/vpx/vpx_image.h b/media/libvpx/vpx/vpx_image.h index c304bacd07e1..0b7bb90572da 100644 --- a/media/libvpx/vpx/vpx_image.h +++ b/media/libvpx/vpx/vpx_image.h @@ -13,13 +13,13 @@ * \brief Describes the vpx image descriptor and associated operations * */ +#ifndef VPX_VPX_IMAGE_H_ +#define VPX_VPX_IMAGE_H_ + #ifdef __cplusplus extern "C" { #endif -#ifndef VPX_IMAGE_H -#define VPX_IMAGE_H - /*!\brief Current ABI version number * * \internal @@ -28,13 +28,13 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_IMAGE_ABI_VERSION (1) /**<\hideinitializer*/ +#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/ -#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format */ -#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U plane in memory */ -#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel component */ - +#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define VPX_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +#define VPX_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */ +#define VPX_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */ /*!\brief List of supported image formats */ typedef enum vpx_img_fmt { @@ -58,7 +58,10 @@ extern "C" { VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4, VPX_IMG_FMT_I422 = VPX_IMG_FMT_PLANAR | 5, VPX_IMG_FMT_I444 = VPX_IMG_FMT_PLANAR | 6, - VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7 + VPX_IMG_FMT_444A = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7, + VPX_IMG_FMT_I42016 = VPX_IMG_FMT_I420 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I42216 = VPX_IMG_FMT_I422 | VPX_IMG_FMT_HIGHBITDEPTH, + VPX_IMG_FMT_I44416 = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */ #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT @@ -100,8 +103,9 @@ extern "C" { vpx_img_fmt_t fmt; /**< Image Format */ /* Image storage dimensions */ - unsigned int w; /**< Stored image width */ - unsigned int h; /**< Stored image height */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ /* Image display dimensions */ unsigned int d_w; /**< Displayed image width */ @@ -139,6 +143,8 @@ extern "C" { unsigned char *img_data; /**< private */ int img_data_owner; /**< private */ int self_allocd; /**< private */ + + void *fb_priv; /**< Frame buffer data associated with the image. 
*/ } vpx_image_t; /**< alias for struct vpx_image */ /**\brief Representation of a rectangle on a surface */ @@ -237,7 +243,8 @@ */ void vpx_img_free(vpx_image_t *img); -#endif #ifdef __cplusplus -} +} // extern "C" #endif + +#endif // VPX_VPX_IMAGE_H_ diff --git a/media/libvpx/vpx/vpx_integer.h b/media/libvpx/vpx/vpx_integer.h index 4ef9243f7d4e..365a76783a7b 100644 --- a/media/libvpx/vpx/vpx_integer.h +++ b/media/libvpx/vpx/vpx_integer.h @@ -9,14 +9,23 @@ */ -#ifndef VPX_INTEGER_H -#define VPX_INTEGER_H +#ifndef VPX_VPX_INTEGER_H_ +#define VPX_VPX_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include <stddef.h> #if !defined(VPX_DONT_DEFINE_STDINT_TYPES) +#if defined(_MSC_VER) +#define VPX_FORCE_INLINE __forceinline +#define VPX_INLINE __inline +#else +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) +// TODO(jbb): Allow a way to force inline off for older compilers. +#define VPX_INLINE inline +#endif + #if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES) typedef signed char int8_t; typedef signed short int16_t; @@ -49,13 +58,13 @@ typedef size_t uintptr_t; #endif -#endif +#endif // VPX_DONT_DEFINE_STDINT_TYPES /* VS2010 defines stdint.h, but not inttypes.h */ -#if defined(_MSC_VER) +#if defined(_MSC_VER) && _MSC_VER < 1800 #define PRId64 "I64d" #else #include <inttypes.h> #endif -#endif +#endif // VPX_VPX_INTEGER_H_ diff --git a/media/libvpx/vpx_config_armv7-android-gcc.asm b/media/libvpx/vpx_config_armv7-android-gcc.asm index 974c0d82cc0c..e6c44ca538df 100644 --- a/media/libvpx/vpx_config_armv7-android-gcc.asm +++ b/media/libvpx/vpx_config_armv7-android-gcc.asm @@ -10,8 +10,10 @@ .equ HAVE_EDSP , 1 .equ HAVE_MEDIA , 1 .equ HAVE_NEON , 1 +.equ HAVE_NEON_ASM , 1 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 +.equ HAVE_MIPS64 , 0 .equ HAVE_MMX , 0 .equ HAVE_SSE , 0 .equ HAVE_SSE2 , 0 @@ -47,7 +49,6 @@ .equ CONFIG_MEM_MANAGER , 0 .equ CONFIG_MEM_TRACKER , 0 .equ CONFIG_MEM_CHECKS , 0 -.equ CONFIG_MD5 , 1 .equ CONFIG_DEQUANT_TOKENS , 0 .equ CONFIG_DC_RECON , 0 .equ CONFIG_RUNTIME_CPU_DETECT , 1 @@ -74,13 +75,19 @@ .equ CONFIG_POSTPROC_VISUALIZER , 0 .equ CONFIG_OS_SUPPORT , 1 .equ CONFIG_UNIT_TESTS , 0 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 -.equ CONFIG_DECRYPT , 0 -.equ CONFIG_MULTIPLE_ARF , 0 -.equ CONFIG_NON420 , 0 -.equ CONFIG_ALPHA , 0 +.equ CONFIG_SIZE_LIMIT , 0 +.equ CONFIG_SPATIAL_SVC , 0 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_FP_MB_STATS , 0 .section .note.GNU-stack,"",%progbits @ This file was created from a .asm file @ using the ads2gas.pl script.
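For reference, the external frame-buffer interface introduced earlier in this patch (vpx_frame_buffer.h together with vpx_codec_set_frame_buffer_functions() in vpx_decoder.h) is the replacement for the removed XMA memory-map API. Below is a minimal sketch of a conforming callback pair, assuming a plain malloc-backed allocator; the example_* names are illustrative only, and a real application would typically pool buffers rather than allocate per frame.

    #include <stdlib.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vpx_frame_buffer.h"

    /* Get callback: allocate at least min_size bytes, record the actual
     * size in fb->size, and return 0 on success (negative on failure).
     * The interface does not require fb->data to be aligned. */
    static int example_get_fb(void *priv, size_t min_size,
                              vpx_codec_frame_buffer_t *fb) {
      (void)priv;
      fb->data = (uint8_t *)malloc(min_size);
      if (!fb->data) return -1;
      fb->size = min_size;
      fb->priv = NULL;  /* optional per-buffer data, echoed back via the image */
      return 0;
    }

    /* Release callback: invoked once libvpx no longer references the buffer. */
    static int example_release_fb(void *priv, vpx_codec_frame_buffer_t *fb) {
      (void)priv;
      free(fb->data);
      fb->data = NULL;
      return 0;
    }

    /* Registration must precede the first vpx_codec_decode() call:
     *   vpx_codec_set_frame_buffer_functions(&ctx, example_get_fb,
     *                                        example_release_fb, NULL);
     */

Per the note in vpx_decoder.h, a VP9 application should be prepared to hand out at least VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS buffers at once.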
diff --git a/media/libvpx/vpx_config_armv7-android-gcc.h b/media/libvpx/vpx_config_armv7-android-gcc.h index 07ef47f81f36..e21327bd5dbb 100644 --- a/media/libvpx/vpx_config_armv7-android-gcc.h +++ b/media/libvpx/vpx_config_armv7-android-gcc.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 1 #define HAVE_MEDIA 1 #define HAVE_NEON 1 +#define HAVE_NEON_ASM 1 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 0 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_generic-gnu.asm b/media/libvpx/vpx_config_generic-gnu.asm index 02578d28b21a..9c09306b6753 100644 --- a/media/libvpx/vpx_config_generic-gnu.asm +++ b/media/libvpx/vpx_config_generic-gnu.asm @@ -10,8 +10,10 @@ .equ HAVE_EDSP , 0 .equ HAVE_MEDIA , 0 .equ HAVE_NEON , 0 +.equ HAVE_NEON_ASM , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 +.equ HAVE_MIPS64 , 0 .equ HAVE_MMX , 0 .equ HAVE_SSE , 0 .equ HAVE_SSE2 , 0 @@ -47,7 +49,6 @@ .equ CONFIG_MEM_MANAGER , 0 .equ CONFIG_MEM_TRACKER , 0 .equ CONFIG_MEM_CHECKS , 0 -.equ CONFIG_MD5 , 1 .equ CONFIG_DEQUANT_TOKENS , 0 .equ CONFIG_DC_RECON , 0 .equ CONFIG_RUNTIME_CPU_DETECT , 0 @@ -74,13 +75,19 @@ .equ CONFIG_POSTPROC_VISUALIZER , 0 .equ CONFIG_OS_SUPPORT , 1 .equ CONFIG_UNIT_TESTS , 1 +.equ CONFIG_WEBM_IO , 1 +.equ CONFIG_LIBYUV , 1 +.equ CONFIG_DECODE_PERF_TESTS , 0 +.equ CONFIG_ENCODE_PERF_TESTS , 0 .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 +.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_EXPERIMENTAL , 0 -.equ CONFIG_DECRYPT , 0 -.equ CONFIG_MULTIPLE_ARF , 0 -.equ CONFIG_NON420 , 0 -.equ CONFIG_ALPHA , 0 +.equ CONFIG_SIZE_LIMIT , 0 +.equ CONFIG_SPATIAL_SVC , 0 +.equ CONFIG_VP9_TEMPORAL_DENOISING , 0 +.equ CONFIG_FP_MB_STATS , 0 .section .note.GNU-stack,"",%progbits @ This file was created from a .asm file @ using the ads2gas.pl script. 
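Similarly, the new vpx_sse_to_psnr() helper shown earlier computes 10 * log10(samples * peak^2 / sse), clamped to MAX_PSNR (100 dB) when sse reaches zero. A quick sanity check of the formula; the frame dimensions below are illustrative only:

    #include <stdio.h>
    #include "vpx/internal/vpx_psnr.h"

    int main(void) {
      /* 640x480 8-bit luma plane, peak sample value 255. Choosing
       * sse == samples (a mean squared error of 1.0 per sample)
       * reduces the formula to 10 * log10(255 * 255) ~= 48.13 dB. */
      const double samples = 640.0 * 480.0;
      printf("PSNR: %.2f dB\n", vpx_sse_to_psnr(samples, 255.0, samples));
      return 0;  /* prints "PSNR: 48.13 dB" */
    }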
diff --git a/media/libvpx/vpx_config_generic-gnu.h b/media/libvpx/vpx_config_generic-gnu.h index 66988faeca0d..d86f316883b7 100644 --- a/media/libvpx/vpx_config_generic-gnu.h +++ b/media/libvpx/vpx_config_generic-gnu.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 0 #define HAVE_SSE 0 #define HAVE_SSE2 0 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 0 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86-darwin9-gcc.asm b/media/libvpx/vpx_config_x86-darwin9-gcc.asm index d490fb8f7c49..d3107552ed72 100644 --- a/media/libvpx/vpx_config_x86-darwin9-gcc.asm +++ b/media/libvpx/vpx_config_x86-darwin9-gcc.asm @@ -7,8 +7,10 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86-darwin9-gcc.h b/media/libvpx/vpx_config_x86-darwin9-gcc.h index f50cdac2aa87..7f91751b62ef 100644 --- a/media/libvpx/vpx_config_x86-darwin9-gcc.h +++ b/media/libvpx/vpx_config_x86-darwin9-gcc.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define 
CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86-linux-gcc.asm b/media/libvpx/vpx_config_x86-linux-gcc.asm index d490fb8f7c49..d3107552ed72 100644 --- a/media/libvpx/vpx_config_x86-linux-gcc.asm +++ b/media/libvpx/vpx_config_x86-linux-gcc.asm @@ -7,8 +7,10 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86-linux-gcc.h b/media/libvpx/vpx_config_x86-linux-gcc.h index f50cdac2aa87..7f91751b62ef 100644 --- a/media/libvpx/vpx_config_x86-linux-gcc.h +++ b/media/libvpx/vpx_config_x86-linux-gcc.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86-win32-gcc.asm b/media/libvpx/vpx_config_x86-win32-gcc.asm index 70072f689f35..1a1680faf14b 100644 --- a/media/libvpx/vpx_config_x86-win32-gcc.asm +++ b/media/libvpx/vpx_config_x86-win32-gcc.asm @@ -7,16 +7,18 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 HAVE_SSSE3 equ 1 HAVE_SSE4_1 equ 1 -HAVE_AVX equ 0 
-HAVE_AVX2 equ 0 +HAVE_AVX equ 1 +HAVE_AVX2 equ 1 HAVE_ALTIVEC equ 0 HAVE_VPX_PORTS equ 1 HAVE_STDINT_H equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86-win32-gcc.h b/media/libvpx/vpx_config_x86-win32-gcc.h index 2e271e0b4657..a8e6599ff643 100644 --- a/media/libvpx/vpx_config_x86-win32-gcc.h +++ b/media/libvpx/vpx_config_x86-win32-gcc.h @@ -19,16 +19,18 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 #define HAVE_SSSE3 1 #define HAVE_SSE4_1 1 -#define HAVE_AVX 0 -#define HAVE_AVX2 0 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 #define HAVE_ALTIVEC 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86-win32-vs8.asm b/media/libvpx/vpx_config_x86-win32-vs8.asm index b0df468e9e46..ea216f01c57c 100644 --- a/media/libvpx/vpx_config_x86-win32-vs8.asm +++ b/media/libvpx/vpx_config_x86-win32-vs8.asm @@ -7,8 +7,10 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 
-CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86-win32-vs8.h b/media/libvpx/vpx_config_x86-win32-vs8.h index 55a54a08005e..dfa37dc5b3fd 100644 --- a/media/libvpx/vpx_config_x86-win32-vs8.h +++ b/media/libvpx/vpx_config_x86-win32-vs8.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86_64-darwin9-gcc.asm b/media/libvpx/vpx_config_x86_64-darwin9-gcc.asm index 9517d15de997..a6e218ce343d 100644 --- a/media/libvpx/vpx_config_x86_64-darwin9-gcc.asm +++ b/media/libvpx/vpx_config_x86_64-darwin9-gcc.asm @@ -7,8 +7,10 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86_64-darwin9-gcc.h b/media/libvpx/vpx_config_x86_64-darwin9-gcc.h index 1f6d1b64e2e5..f692ebccccf3 100644 --- a/media/libvpx/vpx_config_x86_64-darwin9-gcc.h +++ b/media/libvpx/vpx_config_x86_64-darwin9-gcc.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ 
#define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86_64-linux-gcc.asm b/media/libvpx/vpx_config_x86_64-linux-gcc.asm index 9517d15de997..a6e218ce343d 100644 --- a/media/libvpx/vpx_config_x86_64-linux-gcc.asm +++ b/media/libvpx/vpx_config_x86_64-linux-gcc.asm @@ -7,8 +7,10 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86_64-linux-gcc.h b/media/libvpx/vpx_config_x86_64-linux-gcc.h index 1f6d1b64e2e5..f692ebccccf3 100644 --- a/media/libvpx/vpx_config_x86_64-linux-gcc.h +++ b/media/libvpx/vpx_config_x86_64-linux-gcc.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86_64-win64-gcc.asm b/media/libvpx/vpx_config_x86_64-win64-gcc.asm index 2d88f79d6811..f7e306b01146 100644 --- a/media/libvpx/vpx_config_x86_64-win64-gcc.asm +++ b/media/libvpx/vpx_config_x86_64-win64-gcc.asm @@ -7,16 +7,18 
@@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 HAVE_SSE3 equ 1 HAVE_SSSE3 equ 1 HAVE_SSE4_1 equ 1 -HAVE_AVX equ 0 -HAVE_AVX2 equ 0 +HAVE_AVX equ 1 +HAVE_AVX2 equ 1 HAVE_ALTIVEC equ 0 HAVE_VPX_PORTS equ 1 HAVE_STDINT_H equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86_64-win64-gcc.h b/media/libvpx/vpx_config_x86_64-win64-gcc.h index 8495092b226e..373e0f452c46 100644 --- a/media/libvpx/vpx_config_x86_64-win64-gcc.h +++ b/media/libvpx/vpx_config_x86_64-win64-gcc.h @@ -19,16 +19,18 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 #define HAVE_SSE3 1 #define HAVE_SSSE3 1 #define HAVE_SSE4_1 1 -#define HAVE_AVX 0 -#define HAVE_AVX2 0 +#define HAVE_AVX 1 +#define HAVE_AVX2 1 #define HAVE_ALTIVEC 0 #define HAVE_VPX_PORTS 1 #define HAVE_STDINT_H 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_config_x86_64-win64-vs8.asm b/media/libvpx/vpx_config_x86_64-win64-vs8.asm index debd8dc7aca5..8d533385c380 100644 --- a/media/libvpx/vpx_config_x86_64-win64-vs8.asm +++ b/media/libvpx/vpx_config_x86_64-win64-vs8.asm @@ -7,8 +7,10 @@ ARCH_PPC64 equ 0 HAVE_EDSP equ 0 HAVE_MEDIA equ 0 HAVE_NEON equ 0 +HAVE_NEON_ASM equ 0 HAVE_MIPS32 equ 0 HAVE_DSPR2 equ 0 +HAVE_MIPS64 equ 0 HAVE_MMX equ 1 HAVE_SSE equ 1 HAVE_SSE2 equ 1 @@ -44,7 +46,6 @@ CONFIG_FAST_UNALIGNED equ 1 CONFIG_MEM_MANAGER equ 0 CONFIG_MEM_TRACKER equ 0 CONFIG_MEM_CHECKS equ 0 -CONFIG_MD5 equ 1 CONFIG_DEQUANT_TOKENS equ 0 CONFIG_DC_RECON equ 0 CONFIG_RUNTIME_CPU_DETECT equ 1 @@ -71,10 +72,16 @@ CONFIG_SMALL equ 0 CONFIG_POSTPROC_VISUALIZER equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_UNIT_TESTS equ 
1 +CONFIG_WEBM_IO equ 1 +CONFIG_LIBYUV equ 1 +CONFIG_DECODE_PERF_TESTS equ 0 +CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 1 CONFIG_TEMPORAL_DENOISING equ 1 +CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 +CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_EXPERIMENTAL equ 0 -CONFIG_DECRYPT equ 0 -CONFIG_MULTIPLE_ARF equ 0 -CONFIG_NON420 equ 0 -CONFIG_ALPHA equ 0 +CONFIG_SIZE_LIMIT equ 0 +CONFIG_SPATIAL_SVC equ 0 +CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_FP_MB_STATS equ 0 diff --git a/media/libvpx/vpx_config_x86_64-win64-vs8.h b/media/libvpx/vpx_config_x86_64-win64-vs8.h index 8c72c86819e5..2dfb605c8c20 100644 --- a/media/libvpx/vpx_config_x86_64-win64-vs8.h +++ b/media/libvpx/vpx_config_x86_64-win64-vs8.h @@ -19,8 +19,10 @@ #define HAVE_EDSP 0 #define HAVE_MEDIA 0 #define HAVE_NEON 0 +#define HAVE_NEON_ASM 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 +#define HAVE_MIPS64 0 #define HAVE_MMX 1 #define HAVE_SSE 1 #define HAVE_SSE2 1 @@ -56,7 +58,6 @@ #define CONFIG_MEM_MANAGER 0 #define CONFIG_MEM_TRACKER 0 #define CONFIG_MEM_CHECKS 0 -#define CONFIG_MD5 1 #define CONFIG_DEQUANT_TOKENS 0 #define CONFIG_DC_RECON 0 #define CONFIG_RUNTIME_CPU_DETECT 1 @@ -83,11 +84,17 @@ #define CONFIG_POSTPROC_VISUALIZER 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_EXPERIMENTAL 0 -#define CONFIG_DECRYPT 0 -#define CONFIG_MULTIPLE_ARF 0 -#define CONFIG_NON420 0 -#define CONFIG_ALPHA 0 +#define CONFIG_SIZE_LIMIT 0 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_FP_MB_STATS 0 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/media/libvpx/vpx_mem/include/vpx_mem_intrnl.h index 2248ad566fc8..225a3babfe3c 100644 --- a/media/libvpx/vpx_mem/include/vpx_mem_intrnl.h +++ b/media/libvpx/vpx_mem/include/vpx_mem_intrnl.h @@ -9,8 +9,8 @@ */ -#ifndef __VPX_MEM_INTRNL_H__ -#define __VPX_MEM_INTRNL_H__ +#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ #include "./vpx_config.h" #ifndef CONFIG_MEM_MANAGER @@ -92,4 +92,4 @@ this should be a multiple of 4*/ /*returns an addr aligned to the byte boundary specified by align*/ #define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) -#endif /*__VPX_MEM_INTRNL_H__*/ +#endif // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ diff --git a/media/libvpx/vpx_mem/include/vpx_mem_tracker.h b/media/libvpx/vpx_mem/include/vpx_mem_tracker.h index 3be0d2ddb0e5..1335e0017b32 100644 --- a/media/libvpx/vpx_mem/include/vpx_mem_tracker.h +++ b/media/libvpx/vpx_mem/include/vpx_mem_tracker.h @@ -9,8 +9,8 @@ */ -#ifndef __VPX_MEM_TRACKER_H__ -#define __VPX_MEM_TRACKER_H__ +#ifndef VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_ +#define VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_ /* vpx_mem_tracker version info */ #define vpx_mem_tracker_version "2.5.1.1" @@ -176,4 +176,4 @@ extern "C" { } #endif -#endif // __VPX_MEM_TRACKER_H__ +#endif // VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_ diff --git a/media/libvpx/vpx_mem/vpx_mem.c b/media/libvpx/vpx_mem/vpx_mem.c index 059248babde8..da616425c691 100644 --- a/media/libvpx/vpx_mem/vpx_mem.c +++ b/media/libvpx/vpx_mem/vpx_mem.c @@ -16,6 +16,7 @@ #include #include #include "include/vpx_mem_intrnl.h" +#include "vpx/vpx_integer.h" #if CONFIG_MEM_TRACKER #ifndef 
VPX_NO_GLOBALS @@ -452,6 +453,29 @@ void *vpx_memset(void *dest, int val, size_t length) { return VPX_MEMSET_L(dest, val, length); } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +void *vpx_memset16(void *dest, int val, size_t length) { +#if CONFIG_MEM_CHECKS + if ((int)dest < 0x4000) { + _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n", + (int)dest, val, length);) + +#if defined(VXWORKS) + sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0); + + vx_sleep(10000); +#endif + } +#endif + int i; + void *orig = dest; + uint16_t *dest16 = dest; + for (i = 0; i < length; i++) + *dest16++ = val; + return orig; +} +#endif // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + void *vpx_memmove(void *dest, const void *src, size_t count) { #if CONFIG_MEM_CHECKS diff --git a/media/libvpx/vpx_mem/vpx_mem.h b/media/libvpx/vpx_mem/vpx_mem.h index c7321a95e785..e2391f49629d 100644 --- a/media/libvpx/vpx_mem/vpx_mem.h +++ b/media/libvpx/vpx_mem/vpx_mem.h @@ -9,8 +9,8 @@ */ -#ifndef __VPX_MEM_H__ -#define __VPX_MEM_H__ +#ifndef VPX_MEM_VPX_MEM_H_ +#define VPX_MEM_VPX_MEM_H_ #include "vpx_config.h" #if defined(__uClinux__) @@ -73,6 +73,9 @@ extern "C" { void *vpx_memcpy(void *dest, const void *src, size_t length); void *vpx_memset(void *dest, int val, size_t length); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + void *vpx_memset16(void *dest, int val, size_t length); +#endif void *vpx_memmove(void *dest, const void *src, size_t count); /* special memory functions */ @@ -170,4 +173,4 @@ extern "C" { } #endif -#endif /* __VPX_MEM_H__ */ +#endif // VPX_MEM_VPX_MEM_H_ diff --git a/media/libvpx/vpx_ports/arm.h b/media/libvpx/vpx_ports/arm.h index 525a7641f2f7..1e4a8e28873e 100644 --- a/media/libvpx/vpx_ports/arm.h +++ b/media/libvpx/vpx_ports/arm.h @@ -9,11 +9,15 @@ */ -#ifndef VPX_PORTS_ARM_H -#define VPX_PORTS_ARM_H +#ifndef VPX_PORTS_ARM_H_ +#define VPX_PORTS_ARM_H_ #include #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + /*ARMv5TE "Enhanced DSP" instructions.*/ #define HAS_EDSP 0x01 /*ARMv6 "Parallel" or "Media" instructions.*/ @@ -23,5 +27,9 @@ int arm_cpu_caps(void); +#ifdef __cplusplus +} // extern "C" #endif +#endif // VPX_PORTS_ARM_H_ + diff --git a/media/libvpx/vpx_ports/arm_cpudetect.c b/media/libvpx/vpx_ports/arm_cpudetect.c index 542ff6786d3c..fa0e030b5520 100644 --- a/media/libvpx/vpx_ports/arm_cpudetect.c +++ b/media/libvpx/vpx_ports/arm_cpudetect.c @@ -12,6 +12,13 @@ #include #include "arm.h" +#ifdef WINAPI_FAMILY +#include +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + static int arm_cpu_env_flags(int *flags) { char *env; env = getenv("VPX_SIMD_CAPS"); diff --git a/media/libvpx/vpx_ports/asm_offsets.h b/media/libvpx/vpx_ports/asm_offsets.h index d3a3e5a14b2a..317bbedcb136 100644 --- a/media/libvpx/vpx_ports/asm_offsets.h +++ b/media/libvpx/vpx_ports/asm_offsets.h @@ -9,8 +9,8 @@ */ -#ifndef VPX_PORTS_ASM_OFFSETS_H -#define VPX_PORTS_ASM_OFFSETS_H +#ifndef VPX_PORTS_ASM_OFFSETS_H_ +#define VPX_PORTS_ASM_OFFSETS_H_ #include @@ -28,4 +28,4 @@ #define END #endif -#endif /* VPX_PORTS_ASM_OFFSETS_H */ +#endif // VPX_PORTS_ASM_OFFSETS_H_ diff --git a/media/libvpx/vpx_ports/config.h b/media/libvpx/vpx_ports/config.h index 1abe70da9291..3c1ab99f4aa3 100644 --- a/media/libvpx/vpx_ports/config.h +++ b/media/libvpx/vpx_ports/config.h @@ -7,4 +7,10 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
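The vpx_memset16 added above fills 16-bit samples rather than bytes: its length argument counts uint16_t elements, and it is only built when CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH. A small usage sketch; the 10-bit mid-grey value and the row/width parameters are made up for illustration:

```c
#include <stdint.h>
#include "vpx_mem/vpx_mem.h"

/* Fill one row of a 10-bit plane with mid-grey (512).  Note the length is a
 * count of uint16_t samples, not bytes, unlike plain vpx_memset(). */
static void fill_row_10bit(uint16_t *row, int width) {
  vpx_memset16(row, 512, width);
}
```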
*/ + +#ifndef VPX_PORTS_CONFIG_H_ +#define VPX_PORTS_CONFIG_H_ + #include "vpx_config.h" + +#endif // VPX_PORTS_CONFIG_H_ diff --git a/media/libvpx/vpx_ports/emmintrin_compat.h b/media/libvpx/vpx_ports/emmintrin_compat.h index 782d603afb6c..16176383d290 100644 --- a/media/libvpx/vpx_ports/emmintrin_compat.h +++ b/media/libvpx/vpx_ports/emmintrin_compat.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H -#define VPX_PORTS_EMMINTRIN_COMPAT_H +#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_ +#define VPX_PORTS_EMMINTRIN_COMPAT_H_ #if defined(__GNUC__) && __GNUC__ < 4 /* From emmintrin.h (gcc 4.5.3) */ @@ -52,4 +52,4 @@ _mm_castsi128_pd(__m128i __A) } #endif -#endif +#endif // VPX_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/media/libvpx/vpx_ports/mem.h b/media/libvpx/vpx_ports/mem.h index 62b86bb1df8a..e91d776945f1 100644 --- a/media/libvpx/vpx_ports/mem.h +++ b/media/libvpx/vpx_ports/mem.h @@ -9,8 +9,8 @@ */ -#ifndef VPX_PORTS_MEM_H -#define VPX_PORTS_MEM_H +#ifndef VPX_PORTS_MEM_H_ +#define VPX_PORTS_MEM_H_ #include "vpx_config.h" #include "vpx/vpx_integer.h" @@ -44,4 +44,4 @@ #define UNINITIALIZED_IS_SAFE(x) x=x #else #define UNINITIALIZED_IS_SAFE(x) x -#endif +#endif // VPX_PORTS_MEM_H_ diff --git a/media/libvpx/vpx_ports/mem_ops.h b/media/libvpx/vpx_ports/mem_ops.h index 2d44a3a5827d..d4a3d773f3c3 100644 --- a/media/libvpx/vpx_ports/mem_ops.h +++ b/media/libvpx/vpx_ports/mem_ops.h @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_PORTS_MEM_OPS_H_ +#define VPX_PORTS_MEM_OPS_H_ /* \file * \brief Provides portable memory access primitives @@ -131,7 +133,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { } #define mem_get_s_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ const MAU_T *mem = (const MAU_T*)vmem;\ signed MEM_VALUE_T val = mem_get_##end##sz(mem);\ return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz);\ @@ -163,7 +165,7 @@ mem_get_s_generic(le, 32) #undef mem_put_be16 #define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) -static void mem_put_be16(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 8) & 0xff; @@ -172,7 +174,7 @@ static void mem_put_be16(void *vmem, MEM_VALUE_T val) { #undef mem_put_be24 #define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) -static void mem_put_be24(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 16) & 0xff; @@ -182,7 +184,7 @@ static void mem_put_be24(void *vmem, MEM_VALUE_T val) { #undef mem_put_be32 #define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) -static void mem_put_be32(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 24) & 0xff; @@ -193,7 +195,7 @@ static void mem_put_be32(void *vmem, MEM_VALUE_T val) { #undef mem_put_le16 #define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) -static void mem_put_le16(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -202,7 +204,7 @@ static void mem_put_le16(void *vmem, MEM_VALUE_T val) { #undef mem_put_le24 #define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) -static 
void mem_put_le24(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -212,7 +214,7 @@ static void mem_put_le24(void *vmem, MEM_VALUE_T val) { #undef mem_put_le32 #define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) -static void mem_put_le32(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -220,3 +222,5 @@ static void mem_put_le32(void *vmem, MEM_VALUE_T val) { mem[2] = (val >> 16) & 0xff; mem[3] = (val >> 24) & 0xff; } + +#endif // VPX_PORTS_MEM_OPS_H_ diff --git a/media/libvpx/vpx_ports/mem_ops_aligned.h b/media/libvpx/vpx_ports/mem_ops_aligned.h index 0100300a7ea0..c16111fec5d1 100644 --- a/media/libvpx/vpx_ports/mem_ops_aligned.h +++ b/media/libvpx/vpx_ports/mem_ops_aligned.h @@ -8,6 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_ +#define VPX_PORTS_MEM_OPS_ALIGNED_H_ + +#include "vpx/vpx_integer.h" /* \file * \brief Provides portable memory access primitives for operating on aligned @@ -40,19 +44,22 @@ #define swap_endian_32_se(val,raw) swap_endian_32(val,raw) #define mem_get_ne_aligned_generic(end,sz) \ - static unsigned MEM_VALUE_T mem_get_##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ return *mem;\ } #define mem_get_sne_aligned_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ const int##sz##_t *mem = (const int##sz##_t *)vmem;\ return *mem;\ } #define mem_get_se_aligned_generic(end,sz) \ - static unsigned MEM_VALUE_T mem_get_##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ unsigned MEM_VALUE_T val, raw = *mem;\ swap_endian_##sz(val,raw);\ @@ -60,7 +67,8 @@ } #define mem_get_sse_aligned_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ const int##sz##_t *mem = (const int##sz##_t *)vmem;\ unsigned MEM_VALUE_T val, raw = *mem;\ swap_endian_##sz##_se(val,raw);\ @@ -68,13 +76,15 @@ } #define mem_put_ne_aligned_generic(end,sz) \ - static void mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ uint##sz##_t *mem = (uint##sz##_t *)vmem;\ *mem = (uint##sz##_t)val;\ } #define mem_put_se_aligned_generic(end,sz) \ - static void mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ uint##sz##_t *mem = (uint##sz##_t *)vmem, raw;\ swap_endian_##sz(raw,val);\ *mem = (uint##sz##_t)raw;\ @@ -155,3 +165,5 @@ mem_put_le_aligned_generic(32) #undef swap_endian_32 #undef swap_endian_16_se #undef swap_endian_32_se + +#endif // VPX_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/media/libvpx/vpx_ports/vpx_once.h b/media/libvpx/vpx_ports/vpx_once.h index 2687d63939df..bd9eebd643bd 100644 --- a/media/libvpx/vpx_ports/vpx_once.h +++ b/media/libvpx/vpx_ports/vpx_once.h @@ -7,8 +7,9 @@ * in the file 
PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_ONCE_H -#define VPX_ONCE_H + +#ifndef VPX_PORTS_VPX_ONCE_H_ +#define VPX_PORTS_VPX_ONCE_H_ #include "vpx_config.h" @@ -72,6 +73,33 @@ static void once(void (*func)(void)) } +#elif CONFIG_MULTITHREAD && defined(__OS2__) +#define INCL_DOS +#include +static void once(void (*func)(void)) +{ + static int done; + + /* If the initialization is complete, return early. */ + if(done) + return; + + /* Causes all other threads in the process to block themselves + * and give up their time slice. + */ + DosEnterCritSec(); + + if (!done) + { + func(); + done = 1; + } + + /* Restores normal thread dispatching for the current process. */ + DosExitCritSec(); +} + + #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include static void once(void (*func)(void)) @@ -99,4 +127,4 @@ static void once(void (*func)(void)) } #endif -#endif +#endif // VPX_PORTS_VPX_ONCE_H_ diff --git a/media/libvpx/vpx_ports/vpx_timer.h b/media/libvpx/vpx_ports/vpx_timer.h index cdad9ef1d75e..dd98e291c2f7 100644 --- a/media/libvpx/vpx_ports/vpx_timer.h +++ b/media/libvpx/vpx_ports/vpx_timer.h @@ -9,8 +9,11 @@ */ -#ifndef VPX_TIMER_H -#define VPX_TIMER_H +#ifndef VPX_PORTS_VPX_TIMER_H_ +#define VPX_PORTS_VPX_TIMER_H_ + +#include "./vpx_config.h" + #include "vpx/vpx_integer.h" #if CONFIG_OS_SUPPORT @@ -53,7 +56,7 @@ struct vpx_usec_timer { }; -static void +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->begin); @@ -63,7 +66,7 @@ vpx_usec_timer_start(struct vpx_usec_timer *t) { } -static void +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->end); @@ -73,7 +76,7 @@ vpx_usec_timer_mark(struct vpx_usec_timer *t) { } -static int64_t +static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { #if defined(_WIN32) LARGE_INTEGER freq, diff; @@ -101,17 +104,17 @@ struct vpx_usec_timer { void *dummy; }; -static void +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { } -static void +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { } -static long +static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } #endif /* CONFIG_OS_SUPPORT */ -#endif +#endif // VPX_PORTS_VPX_TIMER_H_ diff --git a/media/libvpx/vpx_ports/x86.h b/media/libvpx/vpx_ports/x86.h index 29905834772b..81c2b8b873f3 100644 --- a/media/libvpx/vpx_ports/x86.h +++ b/media/libvpx/vpx_ports/x86.h @@ -9,11 +9,15 @@ */ -#ifndef VPX_PORTS_X86_H -#define VPX_PORTS_X86_H +#ifndef VPX_PORTS_X86_H_ +#define VPX_PORTS_X86_H_ #include #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef enum { VPX_CPU_UNKNOWN = -1, VPX_CPU_AMD, @@ -35,51 +39,63 @@ typedef enum { #if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__) #if ARCH_X86_64 -#define cpuid(func,ax,bx,cx,dx)\ +#define cpuid(func, func2, ax, bx, cx, dx)\ __asm__ __volatile__ (\ "cpuid \n\t" \ : "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) \ - : "a" (func)); + : "a" (func), "c" (func2)); #else -#define cpuid(func,ax,bx,cx,dx)\ +#define cpuid(func, func2, ax, bx, cx, dx)\ __asm__ __volatile__ (\ "mov %%ebx, %%edi \n\t" \ "cpuid \n\t" \ "xchg %%edi, %%ebx \n\t" \ : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \ - : "a" (func)); + : "a" (func), "c" (func2)); #endif #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/ #if ARCH_X86_64 -#define cpuid(func,ax,bx,cx,dx)\ +#define 
cpuid(func, func2, ax, bx, cx, dx)\ asm volatile (\ "xchg %rsi, %rbx \n\t" \ "cpuid \n\t" \ "movl %ebx, %edi \n\t" \ "xchg %rsi, %rbx \n\t" \ : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \ - : "a" (func)); + : "a" (func), "c" (func2)); #else -#define cpuid(func,ax,bx,cx,dx)\ +#define cpuid(func, func2, ax, bx, cx, dx)\ asm volatile (\ "pushl %ebx \n\t" \ "cpuid \n\t" \ "movl %ebx, %edi \n\t" \ "popl %ebx \n\t" \ : "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \ - : "a" (func)); + : "a" (func), "c" (func2)); #endif #else /* end __SUNPRO__ */ #if ARCH_X86_64 -void __cpuid(int CPUInfo[4], int info_type); -#pragma intrinsic(__cpuid) -#define cpuid(func,a,b,c,d) do{\ +#if defined(_MSC_VER) && _MSC_VER > 1500 +void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue); +#pragma intrinsic(__cpuidex) +#define cpuid(func, func2, a, b, c, d) do {\ int regs[4];\ - __cpuid(regs,func); a=regs[0]; b=regs[1]; c=regs[2]; d=regs[3];\ + __cpuidex(regs, func, func2); \ + a = regs[0]; b = regs[1]; c = regs[2]; d = regs[3];\ } while(0) #else -#define cpuid(func,a,b,c,d)\ +void __cpuid(int CPUInfo[4], int info_type); +#pragma intrinsic(__cpuid) +#define cpuid(func, func2, a, b, c, d) do {\ + int regs[4];\ + __cpuid(regs, func); \ + a = regs[0]; b = regs[1]; c = regs[2]; d = regs[3];\ + } while (0) +#endif +#else +#define cpuid(func, func2, a, b, c, d)\ __asm mov eax, func\ + __asm mov ecx, func2\ __asm cpuid\ __asm mov a, eax\ __asm mov b, ebx\ @@ -100,7 +116,7 @@ void __cpuid(int CPUInfo[4], int info_type); #define BIT(n) (1< -#include "x86.h" - -struct cpuid_vendors { - char vendor_string[12]; - vpx_cpu_t vendor_id; -}; - -static struct cpuid_vendors cpuid_vendor_list[VPX_CPU_LAST] = { - { "AuthenticAMD", VPX_CPU_AMD }, - { "AMDisbetter!", VPX_CPU_AMD_OLD }, - { "CentaurHauls", VPX_CPU_CENTAUR }, - { "CyrixInstead", VPX_CPU_CYRIX }, - { "GenuineIntel", VPX_CPU_INTEL }, - { "NexGenDriven", VPX_CPU_NEXGEN }, - { "Geode by NSC", VPX_CPU_NSC }, - { "RiseRiseRise", VPX_CPU_RISE }, - { "SiS SiS SiS ", VPX_CPU_SIS }, - { "GenuineTMx86", VPX_CPU_TRANSMETA }, - { "TransmetaCPU", VPX_CPU_TRANSMETA_OLD }, - { "UMC UMC UMC ", VPX_CPU_UMC }, - { "VIA VIA VIA ", VPX_CPU_VIA }, -}; - -vpx_cpu_t vpx_x86_vendor(void) { - unsigned int reg_eax; - unsigned int vs[3]; - int i; - - /* Get the Vendor String from the CPU */ - cpuid(0, reg_eax, vs[0], vs[2], vs[1]); - - for (i = 0; i < VPX_CPU_LAST; i++) { - if (strncmp((const char *)vs, cpuid_vendor_list[i].vendor_string, 12) == 0) - return (cpuid_vendor_list[i].vendor_id); - } - - return VPX_CPU_UNKNOWN; -} diff --git a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm b/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm deleted file mode 100644 index d070a476bbfa..000000000000 --- a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm +++ /dev/null @@ -1,123 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
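The extra func2 argument threaded through every cpuid variant above exists because CPUID leaf 7, the structured extended feature leaf that reports AVX2 in EBX bit 5, must be queried with ECX set to sub-leaf 0; the old single-argument form left ECX unspecified. A sketch of the query the widened macro enables, assuming the CPU's maximum supported leaf is at least 7 (production code should verify that via cpuid(0, 0, ...) first):

```c
#include "vpx_ports/x86.h"

/* Returns non-zero when CPUID.(EAX=7,ECX=0):EBX bit 5 (AVX2) is set. */
static int cpu_has_avx2(void) {
  unsigned int eax, ebx, ecx, edx;
  cpuid(7, 0, eax, ebx, ecx, edx);  /* leaf 7, sub-leaf 0 */
  return (ebx >> 5) & 1;
}
```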
-; - - - EXPORT |vpx_yv12_copy_y_neon| - - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_yv12_copy_y_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc) -|vpx_yv12_copy_y_neon| PROC - push {r4 - r11, lr} - vpush {d8-d15} - - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop1 - mov r8, r2 - mov r9, r3 - add r10, r2, r6 - add r11, r3, r7 - movs r12, r5, lsr #7 - ble extra_copy_needed ; y_width < 128 - -cp_src_to_dst_width_loop1 - vld1.8 {q0, q1}, [r8]! - vld1.8 {q8, q9}, [r10]! - vld1.8 {q2, q3}, [r8]! - vld1.8 {q10, q11}, [r10]! - vld1.8 {q4, q5}, [r8]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q6, q7}, [r8]! - vld1.8 {q14, q15}, [r10]! - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r9]! - vst1.8 {q8, q9}, [r11]! - vst1.8 {q2, q3}, [r9]! - vst1.8 {q10, q11}, [r11]! - vst1.8 {q4, q5}, [r9]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q6, q7}, [r9]! - vst1.8 {q14, q15}, [r11]! - - bne cp_src_to_dst_width_loop1 - - subs lr, lr, #1 - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne cp_src_to_dst_height_loop1 - -extra_copy_needed - ands r10, r5, #0x7f ;check to see if extra copy is needed - sub r11, r5, r10 - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - bne extra_cp_src_to_dst_width1 -end_of_cp_src_to_dst1 - - vpop {d8 - d15} - pop {r4-r11, pc} - -;============================= -extra_cp_src_to_dst_width1 - add r2, r2, r11 - add r3, r3, r11 - add r0, r8, r6 - add r11, r9, r7 - - mov lr, r4, lsr #1 -extra_cp_src_to_dst_height_loop1 - mov r8, r2 - mov r9, r3 - add r0, r8, r6 - add r11, r9, r7 - - mov r12, r10 - -extra_cp_src_to_dst_width_loop1 - vld1.8 {q0}, [r8]! - vld1.8 {q1}, [r0]! - - subs r12, r12, #16 - - vst1.8 {q0}, [r9]! - vst1.8 {q1}, [r11]! - bne extra_cp_src_to_dst_width_loop1 - - subs lr, lr, #1 - - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne extra_cp_src_to_dst_height_loop1 - - b end_of_cp_src_to_dst1 - - ENDP - - END diff --git a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm deleted file mode 100644 index 696f47a7b4ed..000000000000 --- a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm +++ /dev/null @@ -1,233 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
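The deleted vpx_yv12_copy_y_neon streams two rows per iteration in 128-byte NEON chunks, with a second pass for any width remainder. Stripped of the vectorization it is a per-row copy; a plain-C restatement using the same YV12_BUFFER_CONFIG fields the assembly reads through the asm offsets:

```c
#include <string.h>
#include "vpx_scale/yv12config.h"

/* C equivalent of the deleted Y-plane copy: one memcpy per row, honouring
 * the independent source and destination strides. */
static void yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
                          YV12_BUFFER_CONFIG *dst_ybc) {
  int row;
  const unsigned char *src = src_ybc->y_buffer;
  unsigned char *dst = dst_ybc->y_buffer;
  for (row = 0; row < src_ybc->y_height; ++row) {
    memcpy(dst, src, src_ybc->y_width);
    src += src_ybc->y_stride;
    dst += dst_ybc->y_stride;
  }
}
```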
-; - - - EXPORT |vp8_yv12_copy_frame_func_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc); - -|vp8_yv12_copy_frame_func_neon| PROC - push {r4 - r11, lr} - vpush {d8 - d15} - - sub sp, sp, #16 - - ;Copy Y plane - ldr r8, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r9, [r1, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r10, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 - ldr r11, [r1, #yv12_buffer_config_v_buffer] ;srcptr1 - - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - str r8, [sp] - str r9, [sp, #4] - str r10, [sp, #8] - str r11, [sp, #12] - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop - mov r8, r2 - mov r9, r3 - add r10, r2, r6 - add r11, r3, r7 - movs r12, r5, lsr #7 - ble extra_cp_needed ; y_width < 128 - -cp_src_to_dst_width_loop - vld1.8 {q0, q1}, [r8]! - vld1.8 {q8, q9}, [r10]! - vld1.8 {q2, q3}, [r8]! - vld1.8 {q10, q11}, [r10]! - vld1.8 {q4, q5}, [r8]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q6, q7}, [r8]! - vld1.8 {q14, q15}, [r10]! - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r9]! - vst1.8 {q8, q9}, [r11]! - vst1.8 {q2, q3}, [r9]! - vst1.8 {q10, q11}, [r11]! - vst1.8 {q4, q5}, [r9]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q6, q7}, [r9]! - vst1.8 {q14, q15}, [r11]! - - bne cp_src_to_dst_width_loop - - subs lr, lr, #1 - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne cp_src_to_dst_height_loop - -extra_cp_needed - ands r10, r5, #0x7f ;check to see if extra copy is needed - sub r11, r5, r10 - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - bne extra_cp_src_to_dst_width -end_of_cp_src_to_dst - -;Copy U & V planes - ldr r2, [sp] ;srcptr1 - ldr r3, [sp, #4] ;dstptr1 - mov r4, r4, lsr #1 ;src uv_height - mov r5, r5, lsr #1 ;src uv_width - mov r6, r6, lsr #1 ;src uv_stride - mov r7, r7, lsr #1 ;dst uv_stride - - mov r1, #2 - -cp_uv_loop - - ;copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_uv_loop - mov r8, r2 - mov r9, r3 - add r10, r2, r6 - add r11, r3, r7 - movs r12, r5, lsr #6 - ble extra_uv_cp_needed - -cp_src_to_dst_width_uv_loop - vld1.8 {q0, q1}, [r8]! - vld1.8 {q8, q9}, [r10]! - vld1.8 {q2, q3}, [r8]! - vld1.8 {q10, q11}, [r10]! - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r9]! - vst1.8 {q8, q9}, [r11]! - vst1.8 {q2, q3}, [r9]! - vst1.8 {q10, q11}, [r11]! 
- - bne cp_src_to_dst_width_uv_loop - - subs lr, lr, #1 - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne cp_src_to_dst_height_uv_loop - -extra_uv_cp_needed - ands r10, r5, #0x3f ;check to see if extra copy is needed - sub r11, r5, r10 - ldr r2, [sp] ;srcptr1 - ldr r3, [sp, #4] ;dstptr1 - bne extra_cp_src_to_dst_uv_width -end_of_cp_src_to_dst_uv - - subs r1, r1, #1 - - addne sp, sp, #8 - - ldrne r2, [sp] ;srcptr1 - ldrne r3, [sp, #4] ;dstptr1 - - bne cp_uv_loop - - add sp, sp, #8 - - vpop {d8 - d15} - pop {r4 - r11, pc} - -;============================= -extra_cp_src_to_dst_width - add r2, r2, r11 - add r3, r3, r11 - add r0, r8, r6 - add r11, r9, r7 - - mov lr, r4, lsr #1 -extra_cp_src_to_dst_height_loop - mov r8, r2 - mov r9, r3 - add r0, r8, r6 - add r11, r9, r7 - - mov r12, r10 - -extra_cp_src_to_dst_width_loop - vld1.8 {q0}, [r8]! - vld1.8 {q1}, [r0]! - - subs r12, r12, #16 - - vst1.8 {q0}, [r9]! - vst1.8 {q1}, [r11]! - bne extra_cp_src_to_dst_width_loop - - subs lr, lr, #1 - - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne extra_cp_src_to_dst_height_loop - - b end_of_cp_src_to_dst - -;================================= -extra_cp_src_to_dst_uv_width - add r2, r2, r11 - add r3, r3, r11 - add r0, r8, r6 - add r11, r9, r7 - - mov lr, r4, lsr #1 -extra_cp_src_to_dst_height_uv_loop - mov r8, r2 - mov r9, r3 - add r0, r8, r6 - add r11, r9, r7 - - mov r12, r10 - -extra_cp_src_to_dst_width_uv_loop - vld1.8 {d0}, [r8]! - vld1.8 {d1}, [r0]! - - subs r12, r12, #8 - - vst1.8 {d0}, [r9]! - vst1.8 {d1}, [r11]! - bne extra_cp_src_to_dst_width_uv_loop - - subs lr, lr, #1 - - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne extra_cp_src_to_dst_height_uv_loop - - b end_of_cp_src_to_dst_uv - - ENDP - END diff --git a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm deleted file mode 100644 index d3306b669ae9..000000000000 --- a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm +++ /dev/null @@ -1,259 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_yv12_copy_src_frame_func_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: This function is used to copy source data in src_buffer[i] at beginning -;of the encoding. The buffer has a width and height of cpi->oxcf.Width and -;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4). 
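The comment above states the key contract: source dimensions can be ANY numbers, which is why this routine cascades through 128-byte, 8-byte, and single-byte copy loops on every row and special-cases an odd final row. The same tail-handling shape in C, for one row:

```c
#include <string.h>

/* Mirrors the deleted cp_width_128_loop / cp_width_8_loop / cp_width_1_loop
 * cascade: bulk chunks first, then ever-smaller tails, so any width works. */
static void copy_row_chunked(unsigned char *dst, const unsigned char *src,
                             int width) {
  while (width >= 128) { memcpy(dst, src, 128); dst += 128; src += 128; width -= 128; }
  while (width >= 8)   { memcpy(dst, src, 8);   dst += 8;   src += 8;   width -= 8; }
  while (width > 0)    { *dst++ = *src++; --width; }
}
```

In C a single memcpy(dst, src, width) already satisfies the contract; the explicit cascade only pays off when each tier maps to a different NEON load/store width.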
- -;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc); - -|vp8_yv12_copy_src_frame_func_neon| PROC - push {r4 - r11, lr} - vpush {d8 - d15} - - ;Copy Y plane - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop - mov r12, r5 - -cp_width_128_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q14, q15}, [r10]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q10, q11}, [r3]! - vst1.8 {q14, q15}, [r11]! - bhs cp_width_128_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! - bhs cp_width_8_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_1_loop - -cp_width_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_loop - -;copy last line for Y if y_height is odd - tst r4, #1 - beq cp_width_done_1 - mov r12, r5 - -cp_width_128_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q10, q11}, [r2]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q10, q11}, [r3]! - bhs cp_width_128_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_8_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_1_loop_1 -cp_width_done_1 - -;Copy U & V planes - ldr r4, [r0, #yv12_buffer_config_uv_height] - ldr r5, [r0, #yv12_buffer_config_uv_width] - ldr r6, [r0, #yv12_buffer_config_uv_stride] - ldr r7, [r1, #yv12_buffer_config_uv_stride] - ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - mov r9, #2 - -cp_uv_loop - ;copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_uv_loop - mov r12, r5 - -cp_width_uv_64_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - bhs cp_width_uv_64_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! 
- bhs cp_width_uv_8_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_uv_1_loop - -cp_width_uv_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_uv_loop - -;copy last line for U & V if uv_height is odd - tst r4, #1 - beq cp_width_uv_done_1 - mov r12, r5 - -cp_width_uv_64_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - bhs cp_width_uv_64_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_uv_8_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_uv_1_loop_1 -cp_width_uv_done_1 - - subs r9, r9, #1 - ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 - ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1 - ldrne r10, [r0, #yv12_buffer_config_uv_stride] - ldrne r11, [r1, #yv12_buffer_config_uv_stride] - - addne r10, r2, r10 ;second row src - addne r11, r3, r11 ;second row dst - - bne cp_uv_loop - - vpop {d8 - d15} - pop {r4 - r11, pc} - - ENDP - END diff --git a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm deleted file mode 100644 index b2eb9eb0f22b..000000000000 --- a/media/libvpx/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm +++ /dev/null @@ -1,308 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_yv12_extend_frame_borders_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf); -; we depend on VP8BORDERINPIXELS being 32 - -|vp8_yv12_extend_frame_borders_neon| PROC - push {r4 - r10, lr} - vpush {d8 - d15} - - ; Border = 32 - ldr r3, [r0, #yv12_buffer_config_y_width] ; plane_width - ldr r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1 - ldr r4, [r0, #yv12_buffer_config_y_height] ; plane_height - ldr lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride - -; Border copy for Y plane -; copy the left and right most columns out - add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width) - sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1 - sub r5, r1, #32 ; dest_ptr1 = src_ptr1 - Border - - mov r12, r4, lsr #2 ; plane_height / 4 - -copy_left_right_y - vld1.8 {d0[], d1[]}, [r1], lr - vld1.8 {d4[], d5[]}, [r2], lr - vld1.8 {d8[], d9[]}, [r1], lr - vld1.8 {d12[], d13[]}, [r2], lr - vld1.8 {d16[], d17[]}, [r1], lr - vld1.8 {d20[], d21[]}, [r2], lr - vld1.8 {d24[], d25[]}, [r1], lr - vld1.8 {d28[], d29[]}, [r2], lr - - vmov q1, q0 - vmov q3, q2 - vmov q5, q4 - vmov q7, q6 - vmov q9, q8 - vmov q11, q10 - vmov q13, q12 - vmov q15, q14 - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r5], lr - vst1.8 {q2, q3}, [r6], lr - vst1.8 {q4, q5}, [r5], lr - vst1.8 {q6, q7}, [r6], lr - vst1.8 {q8, q9}, [r5], lr - vst1.8 {q10, q11}, [r6], lr - vst1.8 {q12, q13}, [r5], lr - vst1.8 {q14, q15}, [r6], lr - - bne copy_left_right_y - -;Now copy the top and bottom source lines into each line of the respective borders - ldr r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer - mul r8, r4, lr ; plane_height * plane_stride - - ; copy width is plane_stride - movs r12, lr, lsr #7 ; plane_stride / 128 - - sub r1, r1, #32 ; src_ptr1 = y_buffer - Border - add r6, r1, r8 ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride)) - sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride - sub r5, r1, lr, asl #5 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - ble extra_y_copy_needed ; plane stride < 128 - -copy_top_bottom_y - vld1.8 {q0, q1}, [r1]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q2, q3}, [r1]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q4, q5}, [r1]! - vld1.8 {q12, q13}, [r2]! - vld1.8 {q6, q7}, [r1]! - vld1.8 {q14, q15}, [r2]! - - mov r7, #32 ; Border - -top_bottom_32 - subs r7, r7, #1 - - vst1.8 {q0, q1}, [r5]! - vst1.8 {q8, q9}, [r6]! - vst1.8 {q2, q3}, [r5]! - vst1.8 {q10, q11}, [r6]! - vst1.8 {q4, q5}, [r5]! - vst1.8 {q12, q13}, [r6]! - vst1.8 {q6, q7}, [r5]! - vst1.8 {q14, q15}, [r6]! 
- - add r5, r5, lr ; dest_ptr1 += plane_stride - sub r5, r5, #128 ; dest_ptr1 -= 128 - add r6, r6, lr ; dest_ptr2 += plane_stride - sub r6, r6, #128 ; dest_ptr2 -= 128 - - bne top_bottom_32 - - sub r5, r1, lr, asl #5 ; src_ptr1 - (Border* plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - - subs r12, r12, #1 - bne copy_top_bottom_y - -extra_y_copy_needed - mov r7, lr, lsr #4 ; check to see if extra copy is needed - ands r7, r7, #0x7 - bne extra_top_bottom_y -end_of_border_copy_y - -;Border copy for U, V planes -; Border = 16 - ldr r7, [r0, #yv12_buffer_config_u_buffer] ; src_ptr1 - ldr lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride - ldr r3, [r0, #yv12_buffer_config_uv_width] ; plane_width - ldr r4, [r0, #yv12_buffer_config_uv_height] ; plane_height - - mov r10, #2 - -;copy the left and right most columns out -border_copy_uv - mov r1, r7 ; src_ptr1 needs to be saved for second half of loop - sub r5, r1, #16 ; dest_ptr1 = src_ptr1 - Border - add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width) - sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1 - - mov r12, r4, lsr #3 ; plane_height / 8 - -copy_left_right_uv - vld1.8 {d0[], d1[]}, [r1], lr - vld1.8 {d2[], d3[]}, [r2], lr - vld1.8 {d4[], d5[]}, [r1], lr - vld1.8 {d6[], d7[]}, [r2], lr - vld1.8 {d8[], d9[]}, [r1], lr - vld1.8 {d10[], d11[]}, [r2], lr - vld1.8 {d12[], d13[]}, [r1], lr - vld1.8 {d14[], d15[]}, [r2], lr - vld1.8 {d16[], d17[]}, [r1], lr - vld1.8 {d18[], d19[]}, [r2], lr - vld1.8 {d20[], d21[]}, [r1], lr - vld1.8 {d22[], d23[]}, [r2], lr - vld1.8 {d24[], d25[]}, [r1], lr - vld1.8 {d26[], d27[]}, [r2], lr - vld1.8 {d28[], d29[]}, [r1], lr - vld1.8 {d30[], d31[]}, [r2], lr - - subs r12, r12, #1 - - vst1.8 {q0}, [r5], lr - vst1.8 {q1}, [r6], lr - vst1.8 {q2}, [r5], lr - vst1.8 {q3}, [r6], lr - vst1.8 {q4}, [r5], lr - vst1.8 {q5}, [r6], lr - vst1.8 {q6}, [r5], lr - vst1.8 {q7}, [r6], lr - vst1.8 {q8}, [r5], lr - vst1.8 {q9}, [r6], lr - vst1.8 {q10}, [r5], lr - vst1.8 {q11}, [r6], lr - vst1.8 {q12}, [r5], lr - vst1.8 {q13}, [r6], lr - vst1.8 {q14}, [r5], lr - vst1.8 {q15}, [r6], lr - - bne copy_left_right_uv - -;Now copy the top and bottom source lines into each line of the respective borders - mov r1, r7 - mul r8, r4, lr ; plane_height * plane_stride - movs r12, lr, lsr #6 ; plane_stride / 64 - - sub r1, r1, #16 ; src_ptr1 = u_buffer - Border - add r6, r1, r8 ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride) - sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride - sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - ble extra_uv_copy_needed ; plane_stride < 64 - -copy_top_bottom_uv - vld1.8 {q0, q1}, [r1]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q2, q3}, [r1]! - vld1.8 {q10, q11}, [r2]! - - mov r7, #16 ; Border - -top_bottom_16 - subs r7, r7, #1 - - vst1.8 {q0, q1}, [r5]! - vst1.8 {q8, q9}, [r6]! - vst1.8 {q2, q3}, [r5]! - vst1.8 {q10, q11}, [r6]! 
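Per plane, the deleted border extension does two jobs: splat the first and last column value across the left/right border of every row, then replicate the first and last bordered row outward border times (32 for Y, 16 for U/V, hence the VP8BORDERINPIXELS dependency noted in the file header). A compact C restatement, essentially what the generic extend_plane in yv12extend.c further below does:

```c
#include <string.h>

/* Replicate plane edges into a surrounding border, as the deleted NEON
 * routine does with border = 32 for Y and 16 for U/V. */
static void extend_plane_borders_c(unsigned char *plane, int stride,
                                   int width, int height, int border) {
  int i;
  unsigned char *row = plane;
  for (i = 0; i < height; ++i, row += stride) {
    memset(row - border, row[0], border);          /* left columns  */
    memset(row + width, row[width - 1], border);   /* right columns */
  }
  for (i = 1; i <= border; ++i) {                  /* top and bottom rows */
    memcpy(plane - border - i * stride, plane - border, width + 2 * border);
    memcpy(plane - border + (height - 1 + i) * stride,
           plane - border + (height - 1) * stride, width + 2 * border);
  }
}
```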
- - add r5, r5, lr ; dest_ptr1 += plane_stride - sub r5, r5, #64 - add r6, r6, lr ; dest_ptr2 += plane_stride - sub r6, r6, #64 - - bne top_bottom_16 - - sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; dest_ptr2 = src_ptr2 + plane_stride - - subs r12, r12, #1 - bne copy_top_bottom_uv -extra_uv_copy_needed - mov r7, lr, lsr #3 ; check to see if extra copy is needed - ands r7, r7, #0x7 - bne extra_top_bottom_uv - -end_of_border_copy_uv - subs r10, r10, #1 - ldrne r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1 - bne border_copy_uv - - vpop {d8 - d15} - pop {r4 - r10, pc} - -;;;;;;;;;;;;;;;;;;;;;; -extra_top_bottom_y - vld1.8 {q0}, [r1]! - vld1.8 {q2}, [r2]! - - mov r9, #4 ; 32 >> 3 - -extra_top_bottom_32 - subs r9, r9, #1 - - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - bne extra_top_bottom_32 - - sub r5, r1, lr, asl #5 ; src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - subs r7, r7, #1 - bne extra_top_bottom_y - - b end_of_border_copy_y - -extra_top_bottom_uv - vld1.8 {d0}, [r1]! - vld1.8 {d8}, [r2]! - - mov r9, #2 ; 16 >> 3 - -extra_top_bottom_16 - subs r9, r9, #1 - - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - bne extra_top_bottom_16 - - sub r5, r1, lr, asl #4 ; src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - subs r7, r7, #1 - bne extra_top_bottom_uv - - b end_of_border_copy_uv - - ENDP - END diff --git a/media/libvpx/vpx_scale/arm/neon/yv12extend_arm.c b/media/libvpx/vpx_scale/arm/neon/yv12extend_arm.c deleted file mode 100644 index fac7bbc1b013..000000000000 --- a/media/libvpx/vpx_scale/arm/neon/yv12extend_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_scale_rtcd.h" - -extern void vp8_yv12_copy_frame_func_neon( - const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc); - -void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc) { - vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - vp8_yv12_extend_frame_borders_neon(dst_ybc); -} diff --git a/media/libvpx/vpx_scale/generic/yv12config.c b/media/libvpx/vpx_scale/generic/yv12config.c index 7c3f7ece9e0e..70d7ac0c8862 100644 --- a/media/libvpx/vpx_scale/generic/yv12config.c +++ b/media/libvpx/vpx_scale/generic/yv12config.c @@ -8,9 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. 
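The yv12config.c changes that follow stop assuming libvpx owns the frame memory: vpx_free is guarded by buffer_alloc_sz > 0 so externally supplied buffers are never freed by the library, and callback-provided pointers are rounded up to a 32-byte boundary with the new yv12_align_addr macro. Its arithmetic, pulled out as a self-contained check (align must be a power of two):

```c
#include <stddef.h>
#include <stdio.h>

/* Same rounding the patch adds as yv12_align_addr: advance addr to the next
 * multiple of align.  (size_t)-(align) is the mask ~(align - 1). */
#define yv12_align_addr(addr, align) \
  (void *)(((size_t)(addr) + ((align)-1)) & (size_t)-(align))

int main(void) {
  unsigned char raw[64 + 31];
  void *aligned = yv12_align_addr(raw, 32);
  /* aligned is the first 32-byte boundary at or after raw */
  printf("raw %p -> aligned %p\n", (void *)raw, aligned);
  return 0;
}
```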
*/ +#include + #include "./vpx_config.h" #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +#include "vp9/common/vp9_common.h" +#endif /**************************************************************************** * Exports @@ -19,10 +24,17 @@ /**************************************************************************** * ****************************************************************************/ +#define yv12_align_addr(addr, align) \ + (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align)) + int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf) { - vpx_free(ybf->buffer_alloc); + // If libvpx is using frame buffer callbacks then buffer_alloc_sz must + // not be set. + if (ybf->buffer_alloc_sz > 0) { + vpx_free(ybf->buffer_alloc); + } /* buffer_alloc isn't accessed by most functions. Rather y_buffer, u_buffer and v_buffer point to buffer_alloc and are used. Clear out @@ -51,7 +63,7 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, const int frame_size = yplane_size + 2 * uvplane_size; if (!ybf->buffer_alloc) { - ybf->buffer_alloc = vpx_memalign(32, frame_size); + ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size); ybf->buffer_alloc_sz = frame_size; } @@ -72,6 +84,8 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->y_height = aligned_height; ybf->y_stride = y_stride; + ybf->uv_crop_width = (width + 1) / 2; + ybf->uv_crop_height = (height + 1) / 2; ybf->uv_width = uv_width; ybf->uv_height = uv_height; ybf->uv_stride = uv_stride; @@ -108,7 +122,9 @@ int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { if (ybf) { - vpx_free(ybf->buffer_alloc); + if (ybf->buffer_alloc_sz > 0) { + vpx_free(ybf->buffer_alloc); + } /* buffer_alloc isn't accessed by most functions. Rather y_buffer, u_buffer and v_buffer point to buffer_alloc and are used. 
Clear out @@ -123,41 +139,91 @@ int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, - int ss_x, int ss_y, int border) { + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, + vpx_codec_frame_buffer_t *fb, + vpx_get_frame_buffer_cb_fn_t cb, + void *cb_priv) { if (ybf) { const int aligned_width = (width + 7) & ~7; const int aligned_height = (height + 7) & ~7; const int y_stride = ((aligned_width + 2 * border) + 31) & ~31; - const int yplane_size = (aligned_height + 2 * border) * y_stride; + const uint64_t yplane_size = (aligned_height + 2 * border) * + (uint64_t)y_stride; const int uv_width = aligned_width >> ss_x; const int uv_height = aligned_height >> ss_y; const int uv_stride = y_stride >> ss_x; const int uv_border_w = border >> ss_x; const int uv_border_h = border >> ss_y; - const int uvplane_size = (uv_height + 2 * uv_border_h) * uv_stride; + const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) * + (uint64_t)uv_stride; #if CONFIG_ALPHA const int alpha_width = aligned_width; const int alpha_height = aligned_height; const int alpha_stride = y_stride; const int alpha_border_w = border; const int alpha_border_h = border; - const int alpha_plane_size = (alpha_height + 2 * alpha_border_h) * - alpha_stride; - const int frame_size = yplane_size + 2 * uvplane_size + - alpha_plane_size; + const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) * + (uint64_t)alpha_stride; +#if CONFIG_VP9_HIGHBITDEPTH + const uint64_t frame_size = (1 + use_highbitdepth) * + (yplane_size + 2 * uvplane_size + alpha_plane_size); #else - const int frame_size = yplane_size + 2 * uvplane_size; -#endif - if (frame_size > ybf->buffer_alloc_sz) { - // Allocation to hold larger frame, or first allocation. - if (ybf->buffer_alloc) - vpx_free(ybf->buffer_alloc); - ybf->buffer_alloc = vpx_memalign(32, frame_size); - ybf->buffer_alloc_sz = frame_size; - } + const uint64_t frame_size = yplane_size + 2 * uvplane_size + + alpha_plane_size; +#endif // CONFIG_VP9_HIGHBITDEPTH +#else +#if CONFIG_VP9_HIGHBITDEPTH + const uint64_t frame_size = + (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size); +#else + const uint64_t frame_size = yplane_size + 2 * uvplane_size; +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_ALPHA + if (cb != NULL) { + const int align_addr_extra_size = 31; + const uint64_t external_frame_size = frame_size + align_addr_extra_size; - if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) - return -1; + assert(fb != NULL); + + if (external_frame_size != (size_t)external_frame_size) + return -1; + + // Allocation to hold larger frame, or first allocation. + if (cb(cb_priv, (size_t)external_frame_size, fb) < 0) + return -1; + + if (fb->data == NULL || fb->size < external_frame_size) + return -1; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + vpx_memset(fb->data, 0, fb->size); + + ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32); + } else if (frame_size > (size_t)ybf->buffer_alloc_sz) { + // Allocation to hold larger frame, or first allocation. 
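The allocation path above leans on two small idioms that are easy to miss: plane sizes are summed in uint64_t and rejected if the total does not survive a round trip through size_t (a guard against truncation on 32-bit hosts), and externally supplied buffers are requested with 31 bytes of slack so that yv12_align_addr can round the start address up to a 32-byte boundary. A minimal, self-contained sketch of both idioms follows; the ALIGN_UP name is invented here and simply mirrors the yv12_align_addr macro defined earlier in this file.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Round addr up to the next multiple of align (align must be a power of 2). */
#define ALIGN_UP(addr, align) \
  (void *)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))

int main(void) {
  /* Sizes are accumulated in 64 bits, then checked against size_t. */
  const uint64_t frame_size = (uint64_t)8192 * 8192 * 3 / 2; /* example Y+UV */
  const uint64_t external_size = frame_size + 31;  /* slack for alignment */
  if (external_size != (size_t)external_size)
    return -1;  /* would truncate on this host; refuse to allocate */

  /* Any buffer with 31 bytes of slack can be aligned up to 32 bytes. */
  unsigned char raw[64 + 31];
  void *aligned = ALIGN_UP(raw, 32);
  assert(((size_t)aligned & 31) == 0);
  return 0;
}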
+ vpx_free(ybf->buffer_alloc); + ybf->buffer_alloc = NULL; + + if (frame_size != (size_t)frame_size) + return -1; + + ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); + if (!ybf->buffer_alloc) + return -1; + + ybf->buffer_alloc_sz = (int)frame_size; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + vpx_memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); + } /* Only support allocating buffers that have a border that's a multiple * of 32. The border restriction is required to get 16-byte alignment of @@ -180,13 +246,33 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = frame_size; + ybf->frame_size = (int)frame_size; +#if CONFIG_VP9_HIGHBITDEPTH + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->y_buffer = p + (border * y_stride) + border; + ybf->u_buffer = p + yplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->v_buffer = p + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; + ybf->u_buffer = ybf->buffer_alloc + yplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w; + ybf->flags = 0; + } +#else ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; ybf->u_buffer = ybf->buffer_alloc + yplane_size + (uv_border_h * uv_stride) + uv_border_w; ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (uv_border_h * uv_stride) + uv_border_w; +#endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_ALPHA ybf->alpha_width = alpha_width; @@ -203,10 +289,18 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, - int ss_x, int ss_y, int border) { + int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border) { if (ybf) { vp9_free_frame_buffer(ybf); - return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border); + return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, NULL, NULL, NULL); } return -2; } diff --git a/media/libvpx/vpx_scale/generic/yv12extend.c b/media/libvpx/vpx_scale/generic/yv12extend.c index 7896dfee639c..0485452aec36 100644 --- a/media/libvpx/vpx_scale/generic/yv12extend.c +++ b/media/libvpx/vpx_scale/generic/yv12extend.c @@ -13,6 +13,9 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/yv12config.h" +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +#include "vp9/common/vp9_common.h" +#endif static void extend_plane(uint8_t *const src, int src_stride, int width, int height, @@ -55,12 +58,84 @@ static void extend_plane(uint8_t *const src, int src_stride, } } +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH +static void extend_plane_high(uint8_t *const src8, int src_stride, + int width, int height, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i; + const int linesize = extend_left + extend_right + width; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + /* copy the left and right most columns out */ + uint16_t *src_ptr1 = src; + uint16_t *src_ptr2 = src 
+ width - 1; + uint16_t *dst_ptr1 = src - extend_left; + uint16_t *dst_ptr2 = src + width; + + for (i = 0; i < height; ++i) { + vpx_memset16(dst_ptr1, src_ptr1[0], extend_left); + vpx_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_stride; + src_ptr2 += src_stride; + dst_ptr1 += src_stride; + dst_ptr2 += src_stride; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = src - extend_left; + src_ptr2 = src + src_stride * (height - 1) - extend_left; + dst_ptr1 = src + src_stride * -extend_top - extend_left; + dst_ptr2 = src + src_stride * height - extend_left; + + for (i = 0; i < extend_top; ++i) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t)); + dst_ptr1 += src_stride; + } + + for (i = 0; i < extend_bottom; ++i) { + vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t)); + dst_ptr2 += src_stride; + } +} +#endif + void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + const int uv_border = ybf->border / 2; + + assert(ybf->border % 2 == 0); assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high( + ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ybf->border, ybf->border, + ybf->border + ybf->y_height - ybf->y_crop_height, + ybf->border + ybf->y_width - ybf->y_crop_width); + + extend_plane_high( + ybf->u_buffer, ybf->uv_stride, + (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, + ybf->border / 2, ybf->border / 2, + (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, + (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + + extend_plane_high( + ybf->v_buffer, ybf->uv_stride, + (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, + ybf->border / 2, ybf->border / 2, + (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, + (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + return; + } +#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, @@ -68,35 +143,48 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { ybf->border + ybf->y_width - ybf->y_crop_width); extend_plane(ybf->u_buffer, ybf->uv_stride, - (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, - ybf->border / 2, ybf->border / 2, - (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, - (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); extend_plane(ybf->v_buffer, ybf->uv_stride, - (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, - ybf->border / 2, ybf->border / 2, - (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, - (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); } #if CONFIG_VP9 -static void extend_frame(YV12_BUFFER_CONFIG *const ybf, - int subsampling_x, int subsampling_y, - int ext_size) { +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) { const int c_w = ybf->uv_crop_width; const int c_h = ybf->uv_crop_height; - const int c_ext_size = ext_size 
>> 1; - const int c_et = c_ext_size; - const int c_el = c_ext_size; - const int c_eb = c_ext_size + ybf->uv_height - ybf->uv_crop_height; - const int c_er = c_ext_size + ybf->uv_width - ybf->uv_crop_width; + const int ss_x = ybf->uv_width < ybf->y_width; + const int ss_y = ybf->uv_height < ybf->y_height; + const int c_et = ext_size >> ss_y; + const int c_el = ext_size >> ss_x; + const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height; + const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width; assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, + ybf->y_crop_width, ybf->y_crop_height, + ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + extend_plane_high(ybf->u_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + extend_plane_high(ybf->v_buffer, ybf->uv_stride, + c_w, c_h, c_et, c_el, c_eb, c_er); + return; + } +#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ext_size, ext_size, @@ -110,17 +198,23 @@ static void extend_frame(YV12_BUFFER_CONFIG *const ybf, c_w, c_h, c_et, c_el, c_eb, c_er); } -void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, - int subsampling_x, int subsampling_y) { - extend_frame(ybf, subsampling_x, subsampling_y, ybf->border); +void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + extend_frame(ybf, ybf->border); } -void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, - int subsampling_x, int subsampling_y) { +void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) { const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ? 
VP9INNERBORDERINPIXELS : ybf->border; - extend_frame(ybf, subsampling_x, subsampling_y, inner_bw); + extend_frame(ybf, inner_bw); } + +#if CONFIG_VP9_HIGHBITDEPTH +void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + vpx_memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // CONFIG_VP9 // Copies the source image into the destination image and updates the @@ -140,6 +234,40 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, assert(src_ybc->y_height == dst_ybc->y_height); #endif +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy_short_addr(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); + return; + } else { + assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); + } +#endif + for (row = 0; row < src_ybc->y_height; ++row) { vpx_memcpy(dst, src, src_ybc->y_width); src += src_ybc->y_stride; @@ -173,6 +301,19 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; +#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_ybc->y_height; ++row) { + vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + for (row = 0; row < src_ybc->y_height; ++row) { vpx_memcpy(dst, src, src_ybc->y_width); src += src_ybc->y_stride; diff --git a/media/libvpx/vpx_scale/vpx_scale.h b/media/libvpx/vpx_scale/vpx_scale.h index 9ddf62efd4cb..43fcf9d66e18 100644 --- a/media/libvpx/vpx_scale/vpx_scale.h +++ b/media/libvpx/vpx_scale/vpx_scale.h @@ -9,8 +9,8 @@ */ -#ifndef VPXSCALE_H -#define VPXSCALE_H +#ifndef VPX_SCALE_VPX_SCALE_H_ +#define VPX_SCALE_VPX_SCALE_H_ #include "vpx_scale/yv12config.h" @@ -24,4 +24,4 @@ extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, unsigned int vratio, unsigned int interlaced); -#endif +#endif // VPX_SCALE_VPX_SCALE_H_ diff --git a/media/libvpx/vpx_scale/yv12config.h b/media/libvpx/vpx_scale/yv12config.h index 0e950fb14f75..eb0a8d68f2bc 100644 --- a/media/libvpx/vpx_scale/yv12config.h +++ b/media/libvpx/vpx_scale/yv12config.h @@ -8,69 +8,87 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef YV12_CONFIG_H -#define YV12_CONFIG_H +#ifndef VPX_SCALE_YV12CONFIG_H_ +#define VPX_SCALE_YV12CONFIG_H_ #ifdef __cplusplus extern "C" { #endif +#include "vpx/vpx_frame_buffer.h" #include "vpx/vpx_integer.h" -#define VP8BORDERINPIXELS 32 -#define VP9INNERBORDERINPIXELS 96 -#define VP9BORDERINPIXELS 160 -#define VP9_INTERP_EXTEND 4 +#define VP8BORDERINPIXELS 32 +#define VP9INNERBORDERINPIXELS 96 +#define VP9_INTERP_EXTEND 4 +#define VP9_ENC_BORDER_IN_PIXELS 160 +#define VP9_DEC_BORDER_IN_PIXELS 32 - typedef struct yv12_buffer_config { - int y_width; - int y_height; - int y_crop_width; - int y_crop_height; - int y_stride; - /* int yinternal_width; */ +typedef struct yv12_buffer_config { + int y_width; + int y_height; + int y_crop_width; + int y_crop_height; + int y_stride; - int uv_width; - int uv_height; - int uv_crop_width; - int uv_crop_height; - int uv_stride; - /* int uvinternal_width; */ + int uv_width; + int uv_height; + int uv_crop_width; + int uv_crop_height; + int uv_stride; - int alpha_width; - int alpha_height; - int alpha_stride; + int alpha_width; + int alpha_height; + int alpha_stride; - uint8_t *y_buffer; - uint8_t *u_buffer; - uint8_t *v_buffer; - uint8_t *alpha_buffer; + uint8_t *y_buffer; + uint8_t *u_buffer; + uint8_t *v_buffer; + uint8_t *alpha_buffer; - uint8_t *buffer_alloc; - int buffer_alloc_sz; - int border; - int frame_size; + uint8_t *buffer_alloc; + int buffer_alloc_sz; + int border; + int frame_size; - int corrupted; - int flags; - } YV12_BUFFER_CONFIG; + int corrupted; + int flags; +} YV12_BUFFER_CONFIG; - int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, +#define YV12_FLAG_HIGHBITDEPTH 1 + +int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int border); +int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border); - int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, - int width, int height, int border); - int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf); +int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf); - int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, +int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, + int width, int height, int ss_x, int ss_y, +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border); + +// Updates the yv12 buffer config with the frame buffer. If cb is not +// NULL, then libvpx is using the frame buffer callbacks to handle memory. +// If cb is not NULL, libvpx will call cb with minimum size in bytes needed +// to decode the current frame. If cb is NULL, libvpx will allocate memory +// internally to decode the current frame. Returns 0 on success. Returns < 0 +// on failure. 
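As a rough illustration of the contract described in the comment above, the caller-side callbacks might look like the following. This is a minimal sketch: the my_* names are hypothetical, error handling is bare, and a real implementation would typically recycle buffers from a pool rather than calling malloc once per frame.

#include <stdlib.h>
#include "vpx/vpx_frame_buffer.h"

/* libvpx asks for at least min_size bytes; returning < 0 signals failure. */
static int my_get_frame_buffer(void *priv, size_t min_size,
                               vpx_codec_frame_buffer_t *fb) {
  (void)priv;  /* the cb_priv pointer, passed through by libvpx */
  fb->data = (uint8_t *)malloc(min_size);
  if (!fb->data) return -1;
  fb->size = min_size;
  fb->priv = NULL;  /* optional per-buffer cookie */
  return 0;
}

static int my_release_frame_buffer(void *priv, vpx_codec_frame_buffer_t *fb) {
  (void)priv;
  free(fb->data);
  fb->data = NULL;
  fb->size = 0;
  return 0;
}

In normal use such callbacks are registered on a decoder via vpx_codec_set_frame_buffer_functions(). Note that vp9_realloc_frame_buffer adds its own 31 bytes of slack and aligns fb->data internally, so the callback only has to satisfy min_size.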
+int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, - int border); - int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, - int width, int height, int ss_x, int ss_y, - int border); - int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); +#if CONFIG_VP9_HIGHBITDEPTH + int use_highbitdepth, +#endif + int border, + vpx_codec_frame_buffer_t *fb, + vpx_get_frame_buffer_cb_fn_t cb, + void *cb_priv); +int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); #ifdef __cplusplus } #endif -#endif // YV12_CONFIG_H +#endif // VPX_SCALE_YV12CONFIG_H_ diff --git a/media/libvpx/vpx_scale_rtcd_armv7-android-gcc.h b/media/libvpx/vpx_scale_rtcd_armv7-android-gcc.h index 0e034e3db5a0..0a6d790c028e 100644 --- a/media/libvpx/vpx_scale_rtcd_armv7-android-gcc.h +++ b/media/libvpx/vpx_scale_rtcd_armv7-android-gcc.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,25 +28,29 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -void vp8_yv12_extend_frame_borders_neon(struct yv12_buffer_config *ybf); -RTCD_EXTERN void (*vp8_yv12_extend_frame_borders)(struct yv12_buffer_config *ybf); +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, 
unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -RTCD_EXTERN void (*vp8_yv12_copy_frame)(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -void vpx_yv12_copy_y_neon(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -RTCD_EXTERN void (*vpx_yv12_copy_y)(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_borders vp9_extend_frame_borders_c -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + void vpx_scale_rtcd(void); + #include "vpx_config.h" #ifdef RTCD_C @@ -59,21 +61,11 @@ static void setup_rtcd_internal(void) (void)flags; - - - - - - - - vp8_yv12_extend_frame_borders = vp8_yv12_extend_frame_borders_c; - if (flags & HAS_NEON) vp8_yv12_extend_frame_borders = vp8_yv12_extend_frame_borders_neon; - - vp8_yv12_copy_frame = vp8_yv12_copy_frame_c; - if (flags & HAS_NEON) vp8_yv12_copy_frame = vp8_yv12_copy_frame_neon; - - vpx_yv12_copy_y = vpx_yv12_copy_y_c; - if (flags & HAS_NEON) vpx_yv12_copy_y = vpx_yv12_copy_y_neon; } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_generic-gnu.h b/media/libvpx/vpx_scale_rtcd_generic-gnu.h index 472a29072cd8..f5e6caa038ef 100644 --- a/media/libvpx/vpx_scale_rtcd_generic-gnu.h +++ b/media/libvpx/vpx_scale_rtcd_generic-gnu.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, 
unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,28 +28,39 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp9_extend_frame_borders vp9_extend_frame_borders_c + +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); -#define vp9_extend_frame_borders vp9_extend_frame_borders_c - -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); -#define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c - void vpx_scale_rtcd(void); + #include "vpx_config.h" #ifdef RTCD_C static void setup_rtcd_internal(void) { - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_x86-darwin9-gcc.h b/media/libvpx/vpx_scale_rtcd_x86-darwin9-gcc.h index e71138366079..7487e5f872ed 100644 --- a/media/libvpx/vpx_scale_rtcd_x86-darwin9-gcc.h +++ b/media/libvpx/vpx_scale_rtcd_x86-darwin9-gcc.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void 
vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vpx_yv12_copy_y vpx_yv12_copy_y_c +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_borders vp9_extend_frame_borders_c -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + void vpx_scale_rtcd(void); #ifdef RTCD_C @@ -55,7 +59,11 
@@ static void setup_rtcd_internal(void) (void)flags; - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_x86-linux-gcc.h b/media/libvpx/vpx_scale_rtcd_x86-linux-gcc.h index e71138366079..7487e5f872ed 100644 --- a/media/libvpx/vpx_scale_rtcd_x86-linux-gcc.h +++ b/media/libvpx/vpx_scale_rtcd_x86-linux-gcc.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vpx_yv12_copy_y vpx_yv12_copy_y_c +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders 
vp8_yv12_extend_frame_borders_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_borders vp9_extend_frame_borders_c -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + void vpx_scale_rtcd(void); #ifdef RTCD_C @@ -55,7 +59,11 @@ static void setup_rtcd_internal(void) (void)flags; - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_x86-win32-gcc.h b/media/libvpx/vpx_scale_rtcd_x86-win32-gcc.h index e71138366079..7487e5f872ed 100644 --- a/media/libvpx/vpx_scale_rtcd_x86-win32-gcc.h +++ b/media/libvpx/vpx_scale_rtcd_x86-win32-gcc.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int 
dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vpx_yv12_copy_y vpx_yv12_copy_y_c +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_borders vp9_extend_frame_borders_c -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + void vpx_scale_rtcd(void); #ifdef RTCD_C @@ -55,7 +59,11 @@ static void setup_rtcd_internal(void) (void)flags; - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_x86-win32-vs8.h b/media/libvpx/vpx_scale_rtcd_x86-win32-vs8.h index e71138366079..7487e5f872ed 100644 --- a/media/libvpx/vpx_scale_rtcd_x86-win32-vs8.h +++ b/media/libvpx/vpx_scale_rtcd_x86-win32-vs8.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int 
dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vpx_yv12_copy_y vpx_yv12_copy_y_c +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_borders vp9_extend_frame_borders_c -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + void vpx_scale_rtcd(void); #ifdef RTCD_C @@ -55,7 +59,11 @@ static void setup_rtcd_internal(void) (void)flags; - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_x86_64-darwin9-gcc.h b/media/libvpx/vpx_scale_rtcd_x86_64-darwin9-gcc.h index e71138366079..7487e5f872ed 100644 --- a/media/libvpx/vpx_scale_rtcd_x86_64-darwin9-gcc.h +++ b/media/libvpx/vpx_scale_rtcd_x86_64-darwin9-gcc.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c - -void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c -void 
vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); -#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c - -void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); -#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c @@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c -void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); -#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c -void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); -#define vpx_yv12_copy_y vpx_yv12_copy_y_c +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c -void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_borders vp9_extend_frame_borders_c -void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y); +void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + void vpx_scale_rtcd(void); #ifdef RTCD_C @@ -55,7 +59,11 @@ static void setup_rtcd_internal(void) (void)flags; - } #endif + +#ifdef __cplusplus +} // extern "C" +#endif + #endif diff --git a/media/libvpx/vpx_scale_rtcd_x86_64-linux-gcc.h b/media/libvpx/vpx_scale_rtcd_x86_64-linux-gcc.h index e71138366079..7487e5f872ed 100644 --- a/media/libvpx/vpx_scale_rtcd_x86_64-linux-gcc.h +++ b/media/libvpx/vpx_scale_rtcd_x86_64-linux-gcc.h @@ -7,22 +7,20 @@ #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + struct yv12_buffer_config; -void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); 
-#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
-
-void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
 void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
 
-void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
-
-void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
 
 void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
 #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
@@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch
 void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
 #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
 
-void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
 
 void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
 
-void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp9_extend_frame_borders vp9_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
 void vpx_scale_rtcd(void);
 
 #ifdef RTCD_C
@@ -55,7 +59,11 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
-
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
diff --git a/media/libvpx/vpx_scale_rtcd_x86_64-win64-gcc.h b/media/libvpx/vpx_scale_rtcd_x86_64-win64-gcc.h
index e71138366079..7487e5f872ed 100644
--- a/media/libvpx/vpx_scale_rtcd_x86_64-win64-gcc.h
+++ b/media/libvpx/vpx_scale_rtcd_x86_64-win64-gcc.h
@@ -7,22 +7,20 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct yv12_buffer_config;
 
-void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
-
-void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
 void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
 
-void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
-
-void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
 
 void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
 #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
@@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch
 void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
 #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
 
-void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
 
 void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
 
-void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp9_extend_frame_borders vp9_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
 void vpx_scale_rtcd(void);
 
 #ifdef RTCD_C
@@ -55,7 +59,11 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
-
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
diff --git a/media/libvpx/vpx_scale_rtcd_x86_64-win64-vs8.h b/media/libvpx/vpx_scale_rtcd_x86_64-win64-vs8.h
index e71138366079..7487e5f872ed 100644
--- a/media/libvpx/vpx_scale_rtcd_x86_64-win64-vs8.h
+++ b/media/libvpx/vpx_scale_rtcd_x86_64-win64-vs8.h
@@ -7,22 +7,20 @@
 #define RTCD_EXTERN extern
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct yv12_buffer_config;
 
-void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
-
-void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
 
 void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
 #define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
 
-void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
-
-void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
 
 void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
 #define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
@@ -30,21 +28,27 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch
 void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
 #define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
 
-void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
-#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
 
 void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
 
-void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
-#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
 
-void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+void vp9_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 #define vp9_extend_frame_borders vp9_extend_frame_borders_c
 
-void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y);
+void vp9_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vp9_extend_frame_inner_borders vp9_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
 void vpx_scale_rtcd(void);
 
 #ifdef RTCD_C
@@ -55,7 +59,11 @@ static void setup_rtcd_internal(void)
 
     (void)flags;
-
 }
 #endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
 #endif
diff --git a/media/libvpx/vpx_version.h b/media/libvpx/vpx_version.h
index f60dcc7092ae..5b10d65f28c4 100644
--- a/media/libvpx/vpx_version.h
+++ b/media/libvpx/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR 1
 #define VERSION_MINOR 3
 #define VERSION_PATCH 0
-#define VERSION_EXTRA ""
+#define VERSION_EXTRA "4207-gc731d6a"
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.3.0"
-#define VERSION_STRING " v1.3.0"
+#define VERSION_STRING_NOSP "v1.3.0-4207-gc731d6a"
+#define VERSION_STRING " v1.3.0-4207-gc731d6a"