Bug 1063356 - Update libvpx source. r=kinetik

Results of running ./update.py --ndk ~/android/android-ndk-r9
    --commit c731d6a4f19eea861ceb2ff31399420b2452eb74
Ralph Giles 2014-10-07 09:49:40 -07:00
parent 1d50c84dd8
commit 307e5968d6
507 changed files with 66918 additions and 59184 deletions


@ -1,22 +1,23 @@
Additional IP Rights Grant (Patents)
------------------------------------
"This implementation" means the copyrightable works distributed by
Google as part of the WebM Project.
"These implementations" means the copyrightable works that implement the WebM
codecs distributed by Google as part of the WebM Project.
Google hereby grants to you a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer, and otherwise run, modify and propagate the contents of this
implementation of VP8, where such license applies only to those patent
claims, both currently owned by Google and acquired in the future,
licensable by Google that are necessarily infringed by this
implementation of VP8. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of VP8 or any code incorporated within this
implementation of VP8 constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of VP8
shall terminate as of the date such litigation is filed.
Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and otherwise
run, modify and propagate the contents of these implementations of WebM, where
such license applies only to those patent claims, both currently owned by
Google and acquired in the future, licensable by Google that are necessarily
infringed by these implementations of WebM. This grant does not include claims
that would be infringed only as a consequence of further modification of these
implementations. If you or your agent or exclusive licensee institute or order
or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitutes direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.


@ -8,4 +8,4 @@ The libvpx git repository is:
https://gerrit.chromium.org/gerrit/webm/libvpx
The git commit ID used was 2e88f2f2ec777259bda1714e72f1ecd2519bceb5
The git commit ID used was c731d6a4f19eea861ceb2ff31399420b2452eb74


@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
##
## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
##


@ -21,6 +21,7 @@ typedef enum {
OUTPUT_FMT_PLAIN,
OUTPUT_FMT_RVDS,
OUTPUT_FMT_GAS,
OUTPUT_FMT_C_HEADER,
} output_fmt_t;
int log_msg(const char *fmt, ...) {
@ -33,6 +34,18 @@ int log_msg(const char *fmt, ...) {
}
#if defined(__GNUC__) && __GNUC__
#if defined(FORCE_PARSE_ELF)
#if defined(__MACH__)
#undef __MACH__
#endif
#if !defined(__ELF__)
#define __ELF__
#endif
#endif
#if defined(__MACH__)
#include <mach-o/loader.h>
@ -43,9 +56,12 @@ int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
case OUTPUT_FMT_RVDS:
printf("%-40s EQU %5d\n", name, val);
return 0;
case OUTPUT_FMT_GAS:
case OUTPUT_FMT_GAS:
printf(".set %-40s, %5d\n", name, val);
return 0;
case OUTPUT_FMT_C_HEADER:
printf("#define %-40s %5d\n", name, val);
return 0;
default:
log_msg("Unsupported mode: %d", mode);
return 1;
@ -321,7 +337,7 @@ bail:
return 1;
}
char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
if (elf->bits == 32) {
Elf32_Shdr shdr;
@ -491,6 +507,13 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
sym.st_name),
val);
break;
case OUTPUT_FMT_C_HEADER:
printf("#define %-40s %5d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
default:
printf("%s = %d\n",
parse_elf_string_table(&elf,
@ -655,7 +678,11 @@ int parse_coff(uint8_t *buf, size_t sz) {
}
strcpy(sectionlist[i], sectionname);
if (!strcmp(sectionname, ".rdata")) sectionrawdata_ptr = get_le32(ptr + 20);
// check if it's .rdata and is not a COMDAT section.
if (!strcmp(sectionname, ".rdata") &&
(get_le32(ptr + 36) & 0x1000) == 0) {
sectionrawdata_ptr = get_le32(ptr + 20);
}
ptr += 40;
}
@ -762,6 +789,7 @@ int main(int argc, char **argv) {
fprintf(stderr, "Output Formats:\n");
fprintf(stderr, " gas - compatible with GNU assembler\n");
fprintf(stderr, " rvds - compatible with armasm\n");
fprintf(stderr, " cheader - c/c++ header file\n");
goto bail;
}
@ -771,6 +799,8 @@ int main(int argc, char **argv) {
mode = OUTPUT_FMT_RVDS;
else if (!strcmp(argv[1], "gas"))
mode = OUTPUT_FMT_GAS;
else if (!strcmp(argv[1], "cheader"))
mode = OUTPUT_FMT_C_HEADER;
else
f = argv[1];
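
(For context on the new output mode above: a rough sketch of what cheader emits, inferred only from the printf format string "#define %-40s %5d\n" added in this diff; the symbol names and values below are hypothetical placeholders, not output from this commit.)

#define EXAMPLE_OFFSET_QCOEFF                        8
#define EXAMPLE_OFFSET_DQCOEFF                      16
#define EXAMPLE_OFFSET_DEQUANT                      24

Presumably this lets the extracted constants be consumed through the C preprocessor rather than the assembler-specific EQU/.set forms produced by the rvds and gas modes.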


@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
##
## Copyright (c) 2013 The WebM project authors. All Rights Reserved.
##
@ -24,7 +24,7 @@ sub FixThumbInstructions($$)
# with left shift, addition and a right shift (to restore the
# register to the original value). Currently the right shift
# isn't necessary in the code base since the values in these
# registers aren't used, but doing the shift for consitency.
# registers aren't used, but doing the shift for consistency.
# This converts instructions such as "add r12, r12, r5, lsl r4"
# into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4".
s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g;
@ -51,7 +51,7 @@ sub FixThumbInstructions($$)
# Convert register post indexing to a separate add instruction.
# This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
# "add r0, r2".
# "addne r0, r0, r2".
s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
# Convert a conditional addition to the pc register into a series of


@ -1,244 +1,105 @@
files = {
'ARM_ASM': [
'vp8/common/arm/armv6/bilinearfilter_v6.asm',
'vp8/common/arm/armv6/copymem16x16_v6.asm',
'vp8/common/arm/armv6/copymem8x4_v6.asm',
'vp8/common/arm/armv6/copymem8x8_v6.asm',
'vp8/common/arm/armv6/dc_only_idct_add_v6.asm',
'vp8/common/arm/armv6/dequant_idct_v6.asm',
'vp8/common/arm/armv6/dequantize_v6.asm',
'vp8/common/arm/armv6/filter_v6.asm',
'vp8/common/arm/armv6/idct_blk_v6.c',
'vp8/common/arm/armv6/idct_v6.asm',
'vp8/common/arm/armv6/intra4x4_predict_v6.asm',
'vp8/common/arm/armv6/iwalsh_v6.asm',
'vp8/common/arm/armv6/loopfilter_v6.asm',
'vp8/common/arm/armv6/simpleloopfilter_v6.asm',
'vp8/common/arm/armv6/sixtappredict8x4_v6.asm',
'vp8/common/arm/armv6/vp8_sad16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance8x8_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm',
'vp8/common/arm/bilinearfilter_arm.c',
'vp8/common/arm/dequantize_arm.c',
'vp8/common/arm/filter_arm.c',
'vp8/common/arm/loopfilter_arm.c',
'vp8/common/arm/neon/bilinearpredict16x16_neon.asm',
'vp8/common/arm/neon/bilinearpredict4x4_neon.asm',
'vp8/common/arm/neon/bilinearpredict8x4_neon.asm',
'vp8/common/arm/neon/bilinearpredict8x8_neon.asm',
'vp8/common/arm/neon/buildintrapredictorsmby_neon.asm',
'vp8/common/arm/neon/copymem16x16_neon.asm',
'vp8/common/arm/neon/copymem8x4_neon.asm',
'vp8/common/arm/neon/copymem8x8_neon.asm',
'vp8/common/arm/neon/dc_only_idct_add_neon.asm',
'vp8/common/arm/neon/dequant_idct_neon.asm',
'vp8/common/arm/neon/dequantizeb_neon.asm',
'vp8/common/arm/neon/idct_blk_neon.c',
'vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
'vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
'vp8/common/arm/neon/iwalsh_neon.asm',
'vp8/common/arm/neon/loopfilter_neon.asm',
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm',
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
'vp8/common/arm/neon/mbloopfilter_neon.asm',
'vp8/common/arm/neon/sad16_neon.asm',
'vp8/common/arm/neon/sad8_neon.asm',
'vp8/common/arm/neon/save_reg_neon.asm',
'vp8/common/arm/neon/shortidct4x4llm_neon.asm',
'vp8/common/arm/neon/sixtappredict16x16_neon.asm',
'vp8/common/arm/neon/sixtappredict4x4_neon.asm',
'vp8/common/arm/neon/sixtappredict8x4_neon.asm',
'vp8/common/arm/neon/sixtappredict8x8_neon.asm',
'vp8/common/arm/neon/variance_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
'vp8/common/arm/reconintra_arm.c',
'vp8/common/arm/variance_arm.c',
'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm',
'vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm',
'vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm',
'vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm',
'vp8/encoder/arm/armv6/vp8_subtract_armv6.asm',
'vp8/encoder/arm/armv6/walsh_v6.asm',
'vp8/encoder/arm/boolhuff_arm.c',
'vp8/encoder/arm/dct_arm.c',
'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
'vp8/encoder/arm/neon/picklpf_arm.c',
'vp8/encoder/arm/neon/shortfdct_neon.asm',
'vp8/encoder/arm/neon/subtract_neon.asm',
'vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm',
'vp8/encoder/arm/quantize_arm.c',
'vp9/common/arm/neon/vp9_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_neon.asm',
'vp9/common/arm/neon/vp9_convolve_neon.c',
'vp9/common/arm/neon/vp9_copy_neon.asm',
'vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_neon.c',
'vp9/common/arm/neon/vp9_loopfilter_neon.asm',
'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
'vp9/common/arm/neon/vp9_save_reg_neon.asm',
'vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm',
'vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm',
'vpx_ports/arm_cpudetect.c',
'vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm',
'vpx_scale/arm/neon/yv12extend_arm.c'
],
'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c'],
'UNIFIED_SOURCES': [
'vp8/common/alloccommon.c',
'vp8/common/blockd.c',
'vp8/common/debugmodes.c',
'vp8/common/dequantize.c',
'vp8/common/entropy.c',
'vp8/common/entropymode.c',
'vp8/common/entropymv.c',
'vp8/common/extend.c',
'vp8/common/filter.c',
'vp8/common/findnearmv.c',
'vp8/common/generic/systemdependent.c',
'vp8/common/idct_blk.c',
'vp8/common/idctllm.c',
'vp8/common/loopfilter.c',
'vp8/common/loopfilter_filters.c',
'vp8/common/mbpitch.c',
'vp8/common/modecont.c',
'vp8/common/quant_common.c',
'vp8/common/reconinter.c',
'vp8/common/reconintra.c',
'vp8/common/reconintra4x4.c',
'vp8/common/setupintrarecon.c',
'vp8/common/swapyv12buffer.c',
'vp8/common/treecoder.c',
'vp8/common/variance_c.c',
'vp8/decoder/dboolhuff.c',
'vp8/decoder/decodemv.c',
'vp8/decoder/decodframe.c',
'vp8/decoder/detokenize.c',
'vp8/decoder/onyxd_if.c',
'vp8/decoder/threading.c',
'vp8/encoder/bitstream.c',
'vp8/encoder/dct.c',
'vp8/encoder/denoising.c',
'vp8/encoder/encodeframe.c',
'vp8/encoder/encodeintra.c',
'vp8/encoder/encodemb.c',
'vp8/encoder/encodemv.c',
'vp8/encoder/ethreading.c',
'vp8/encoder/firstpass.c',
'vp8/encoder/lookahead.c',
'vp8/encoder/mcomp.c',
'vp8/encoder/modecosts.c',
'vp8/encoder/mr_dissim.c',
'vp8/encoder/onyx_if.c',
'vp8/encoder/pickinter.c',
'vp8/encoder/picklpf.c',
'vp8/encoder/psnr.c',
'vp8/encoder/quantize.c',
'vp8/encoder/ratectrl.c',
'vp8/encoder/rdopt.c',
'vp8/encoder/segmentation.c',
'vp8/encoder/temporal_filter.c',
'vp8/encoder/tokenize.c',
'vp8/encoder/treewriter.c',
'vp8/vp8_cx_iface.c',
'vp9/common/generic/vp9_systemdependent.c',
'vp9/common/vp9_alloccommon.c',
'vp9/common/vp9_common_data.c',
'vp9/common/vp9_convolve.c',
'vp9/common/vp9_debugmodes.c',
'vp9/common/vp9_entropy.c',
'vp9/common/vp9_entropymode.c',
'vp9/common/vp9_extend.c',
'vp9/common/vp9_filter.c',
'vp9/common/vp9_findnearmv.c',
'vp9/common/vp9_idct.c',
'vp9/common/vp9_loopfilter.c',
'vp9/common/vp9_loopfilter_filters.c',
'vp9/common/vp9_mvref_common.c',
'vp9/common/vp9_pred_common.c',
'vp9/common/vp9_quant_common.c',
'vp9/common/vp9_reconinter.c',
'vp9/common/vp9_reconintra.c',
'vp9/common/vp9_scale.c',
'vp9/common/vp9_scan.c',
'vp9/common/vp9_seg_common.c',
'vp9/common/vp9_tile_common.c',
'vp9/common/vp9_treecoder.c',
'vp9/decoder/vp9_dboolhuff.c',
'vp9/decoder/vp9_decodemv.c',
'vp9/decoder/vp9_decodframe.c',
'vp9/decoder/vp9_detokenize.c',
'vp9/decoder/vp9_dsubexp.c',
'vp9/decoder/vp9_onyxd_if.c',
'vp9/decoder/vp9_thread.c',
'vp9/encoder/vp9_boolhuff.c',
'vp9/encoder/vp9_dct.c',
'vp9/encoder/vp9_encodeframe.c',
'vp9/encoder/vp9_encodeintra.c',
'vp9/encoder/vp9_encodemb.c',
'vp9/encoder/vp9_encodemv.c',
'vp9/encoder/vp9_firstpass.c',
'vp9/encoder/vp9_lookahead.c',
'vp9/encoder/vp9_mbgraph.c',
'vp9/encoder/vp9_mcomp.c',
'vp9/encoder/vp9_modecosts.c',
'vp9/encoder/vp9_onyx_if.c',
'vp9/encoder/vp9_picklpf.c',
'vp9/encoder/vp9_psnr.c',
'vp9/encoder/vp9_quantize.c',
'vp9/encoder/vp9_ratectrl.c',
'vp9/encoder/vp9_rdopt.c',
'vp9/encoder/vp9_sad_c.c',
'vp9/encoder/vp9_segmentation.c',
'vp9/encoder/vp9_subexp.c',
'vp9/encoder/vp9_temporal_filter.c',
'vp9/encoder/vp9_tokenize.c',
'vp9/encoder/vp9_treewriter.c',
'vp9/encoder/vp9_vaq.c',
'vp9/encoder/vp9_variance_c.c',
'vp9/vp9_cx_iface.c',
'vp9/vp9_dx_iface.c',
'vpx/src/vpx_codec.c',
'vpx/src/vpx_decoder.c',
'vpx/src/vpx_encoder.c',
'vpx/src/vpx_image.c',
'vpx_scale/generic/gen_scalers.c',
'vpx_scale/generic/vpx_scale.c',
'vpx_scale/generic/yv12config.c',
'vpx_scale/generic/yv12extend.c',
'vpx_scale/vpx_scale_rtcd.c'
],
'SOURCES': [
'vp8/common/rtcd.c',
'vp8/common/sad_c.c',
'vp8/vp8_dx_iface.c',
'vp9/common/vp9_entropymv.c',
'vp9/common/vp9_rtcd.c',
'vp9/encoder/vp9_bitstream.c',
'vpx/src/svc_encodeframe.c',
'vpx_mem/vpx_mem.c',
],
'ARM_ASM': ['vp8/common/arm/armv6/bilinearfilter_v6.asm',
'vp8/common/arm/armv6/copymem16x16_v6.asm',
'vp8/common/arm/armv6/copymem8x4_v6.asm',
'vp8/common/arm/armv6/copymem8x8_v6.asm',
'vp8/common/arm/armv6/dc_only_idct_add_v6.asm',
'vp8/common/arm/armv6/dequant_idct_v6.asm',
'vp8/common/arm/armv6/dequantize_v6.asm',
'vp8/common/arm/armv6/filter_v6.asm',
'vp8/common/arm/armv6/idct_blk_v6.c',
'vp8/common/arm/armv6/idct_v6.asm',
'vp8/common/arm/armv6/intra4x4_predict_v6.asm',
'vp8/common/arm/armv6/iwalsh_v6.asm',
'vp8/common/arm/armv6/loopfilter_v6.asm',
'vp8/common/arm/armv6/simpleloopfilter_v6.asm',
'vp8/common/arm/armv6/sixtappredict8x4_v6.asm',
'vp8/common/arm/armv6/vp8_sad16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance8x8_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm',
'vp8/common/arm/bilinearfilter_arm.c',
'vp8/common/arm/dequantize_arm.c',
'vp8/common/arm/filter_arm.c',
'vp8/common/arm/loopfilter_arm.c',
'vp8/common/arm/neon/bilinearpredict_neon.c',
'vp8/common/arm/neon/copymem_neon.c',
'vp8/common/arm/neon/dc_only_idct_add_neon.c',
'vp8/common/arm/neon/dequant_idct_neon.c',
'vp8/common/arm/neon/dequantizeb_neon.c',
'vp8/common/arm/neon/idct_blk_neon.c',
'vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
'vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
'vp8/common/arm/neon/iwalsh_neon.c',
'vp8/common/arm/neon/loopfilter_neon.c',
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
'vp8/common/arm/neon/mbloopfilter_neon.c',
'vp8/common/arm/neon/reconintra_neon.c',
'vp8/common/arm/neon/sad_neon.c',
'vp8/common/arm/neon/shortidct4x4llm_neon.c',
'vp8/common/arm/neon/sixtappredict_neon.c',
'vp8/common/arm/neon/variance_neon.c',
'vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
'vp8/common/arm/variance_arm.c',
'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm',
'vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm',
'vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm',
'vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm',
'vp8/encoder/arm/armv6/vp8_subtract_armv6.asm',
'vp8/encoder/arm/armv6/walsh_v6.asm',
'vp8/encoder/arm/boolhuff_arm.c',
'vp8/encoder/arm/dct_arm.c',
'vp8/encoder/arm/neon/denoising_neon.c',
'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
'vp8/encoder/arm/neon/shortfdct_neon.c',
'vp8/encoder/arm/neon/subtract_neon.c',
'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
'vp8/encoder/arm/quantize_arm.c',
'vp9/common/arm/neon/vp9_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_neon.asm',
'vp9/common/arm/neon/vp9_convolve_neon.c',
'vp9/common/arm/neon/vp9_copy_neon.asm',
'vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_neon.c',
'vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct32x32_add_neon.asm',
'vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct8x8_add_neon.asm',
'vp9/common/arm/neon/vp9_iht4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_iht8x8_add_neon.asm',
'vp9/common/arm/neon/vp9_loopfilter_16_neon.asm',
'vp9/common/arm/neon/vp9_loopfilter_16_neon.c',
'vp9/common/arm/neon/vp9_loopfilter_neon.asm',
'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
'vp9/common/arm/neon/vp9_reconintra_neon.asm',
'vp9/common/arm/neon/vp9_save_reg_neon.asm',
'vp9/encoder/arm/neon/vp9_dct_neon.c',
'vp9/encoder/arm/neon/vp9_quantize_neon.c',
'vp9/encoder/arm/neon/vp9_sad_neon.c',
'vp9/encoder/arm/neon/vp9_subtract_neon.c',
'vp9/encoder/arm/neon/vp9_variance_neon.c',
'vpx_ports/arm_cpudetect.c'],
'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c',
'vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c',
'vp9/encoder/x86/vp9_dct32x32_avx2.c',
'vp9/encoder/x86/vp9_dct_avx2.c',
'vp9/encoder/x86/vp9_error_intrin_avx2.c',
'vp9/encoder/x86/vp9_sad4d_intrin_avx2.c',
'vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c',
'vp9/encoder/x86/vp9_variance_avx2.c',
'vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c'],
'ERROR_CONCEALMENT': ['vp8/decoder/error_concealment.c'],
'EXPORTS': ['vpx/vp8.h',
'vpx/vp8cx.h',
@ -246,6 +107,7 @@ files = {
'vpx/vpx_codec.h',
'vpx/vpx_decoder.h',
'vpx/vpx_encoder.h',
'vpx/vpx_frame_buffer.h',
'vpx/vpx_image.h',
'vpx/vpx_integer.h',
'vpx_mem/include/vpx_mem_intrnl.h',
@ -256,86 +118,225 @@ files = {
'vpx_ports/x86.h',
'vpx_scale/vpx_scale.h',
'vpx_scale/yv12config.h'],
'SOURCES': ['vp8/common/rtcd.c',
'vp8/common/sad_c.c',
'vp8/encoder/bitstream.c',
'vp8/encoder/onyx_if.c',
'vp8/vp8_dx_iface.c',
'vp9/common/vp9_alloccommon.c',
'vp9/common/vp9_blockd.c',
'vp9/common/vp9_common_data.c',
'vp9/common/vp9_convolve.c',
'vp9/common/vp9_debugmodes.c',
'vp9/common/vp9_entropy.c',
'vp9/common/vp9_entropymode.c',
'vp9/common/vp9_entropymv.c',
'vp9/common/vp9_filter.c',
'vp9/common/vp9_frame_buffers.c',
'vp9/common/vp9_idct.c',
'vp9/common/vp9_loopfilter.c',
'vp9/common/vp9_loopfilter_filters.c',
'vp9/common/vp9_mvref_common.c',
'vp9/common/vp9_pred_common.c',
'vp9/common/vp9_prob.c',
'vp9/common/vp9_quant_common.c',
'vp9/common/vp9_reconinter.c',
'vp9/common/vp9_reconintra.c',
'vp9/common/vp9_rtcd.c',
'vp9/common/vp9_scale.c',
'vp9/common/vp9_scan.c',
'vp9/common/vp9_seg_common.c',
'vp9/common/vp9_thread.c',
'vp9/common/vp9_tile_common.c',
'vp9/decoder/vp9_decodeframe.c',
'vp9/decoder/vp9_decodemv.c',
'vp9/decoder/vp9_decoder.c',
'vp9/decoder/vp9_detokenize.c',
'vp9/decoder/vp9_dsubexp.c',
'vp9/decoder/vp9_dthread.c',
'vp9/decoder/vp9_reader.c',
'vp9/encoder/vp9_aq_complexity.c',
'vp9/encoder/vp9_aq_cyclicrefresh.c',
'vp9/encoder/vp9_aq_variance.c',
'vp9/encoder/vp9_bitstream.c',
'vp9/encoder/vp9_context_tree.c',
'vp9/encoder/vp9_cost.c',
'vp9/encoder/vp9_dct.c',
'vp9/encoder/vp9_encodeframe.c',
'vp9/encoder/vp9_encodemb.c',
'vp9/encoder/vp9_encodemv.c',
'vp9/encoder/vp9_encoder.c',
'vp9/encoder/vp9_extend.c',
'vp9/encoder/vp9_firstpass.c',
'vp9/encoder/vp9_lookahead.c',
'vp9/encoder/vp9_mbgraph.c',
'vp9/encoder/vp9_mcomp.c',
'vp9/encoder/vp9_picklpf.c',
'vp9/encoder/vp9_pickmode.c',
'vp9/encoder/vp9_quantize.c',
'vp9/encoder/vp9_ratectrl.c',
'vp9/encoder/vp9_rd.c',
'vp9/encoder/vp9_rdopt.c',
'vp9/encoder/vp9_resize.c',
'vp9/encoder/vp9_sad.c',
'vp9/encoder/vp9_segmentation.c',
'vp9/encoder/vp9_speed_features.c',
'vp9/encoder/vp9_subexp.c',
'vp9/encoder/vp9_svc_layercontext.c',
'vp9/encoder/vp9_temporal_filter.c',
'vp9/encoder/vp9_tokenize.c',
'vp9/encoder/vp9_treewriter.c',
'vp9/encoder/vp9_variance.c',
'vp9/encoder/vp9_write_bit_buffer.c',
'vp9/encoder/vp9_writer.c',
'vp9/vp9_cx_iface.c',
'vp9/vp9_dx_iface.c',
'vpx/src/svc_encodeframe.c',
'vpx/src/vpx_encoder.c',
'vpx_mem/vpx_mem.c',
'vpx_scale/generic/yv12config.c',
'vpx_scale/generic/yv12extend.c',
'vpx_scale/vpx_scale_rtcd.c'],
'UNIFIED_SOURCES': ['vp8/common/alloccommon.c',
'vp8/common/blockd.c',
'vp8/common/debugmodes.c',
'vp8/common/dequantize.c',
'vp8/common/entropy.c',
'vp8/common/entropymode.c',
'vp8/common/entropymv.c',
'vp8/common/extend.c',
'vp8/common/filter.c',
'vp8/common/findnearmv.c',
'vp8/common/generic/systemdependent.c',
'vp8/common/idct_blk.c',
'vp8/common/idctllm.c',
'vp8/common/loopfilter.c',
'vp8/common/loopfilter_filters.c',
'vp8/common/mbpitch.c',
'vp8/common/modecont.c',
'vp8/common/quant_common.c',
'vp8/common/reconinter.c',
'vp8/common/reconintra.c',
'vp8/common/reconintra4x4.c',
'vp8/common/setupintrarecon.c',
'vp8/common/swapyv12buffer.c',
'vp8/common/treecoder.c',
'vp8/common/variance_c.c',
'vp8/decoder/dboolhuff.c',
'vp8/decoder/decodeframe.c',
'vp8/decoder/decodemv.c',
'vp8/decoder/detokenize.c',
'vp8/decoder/onyxd_if.c',
'vp8/decoder/threading.c',
'vp8/encoder/dct.c',
'vp8/encoder/denoising.c',
'vp8/encoder/encodeframe.c',
'vp8/encoder/encodeintra.c',
'vp8/encoder/encodemb.c',
'vp8/encoder/encodemv.c',
'vp8/encoder/ethreading.c',
'vp8/encoder/firstpass.c',
'vp8/encoder/lookahead.c',
'vp8/encoder/mcomp.c',
'vp8/encoder/modecosts.c',
'vp8/encoder/mr_dissim.c',
'vp8/encoder/pickinter.c',
'vp8/encoder/picklpf.c',
'vp8/encoder/quantize.c',
'vp8/encoder/ratectrl.c',
'vp8/encoder/rdopt.c',
'vp8/encoder/segmentation.c',
'vp8/encoder/temporal_filter.c',
'vp8/encoder/tokenize.c',
'vp8/encoder/treewriter.c',
'vp8/vp8_cx_iface.c',
'vp9/decoder/vp9_read_bit_buffer.c',
'vpx/src/vpx_codec.c',
'vpx/src/vpx_decoder.c',
'vpx/src/vpx_image.c',
'vpx/src/vpx_psnr.c',
'vpx_scale/generic/gen_scalers.c',
'vpx_scale/generic/vpx_scale.c'],
'VP8_POSTPROC': ['vp8/common/mfqe.c', 'vp8/common/postproc.c'],
'VP9_POSTPROC': ['vp9/common/vp9_postproc.c'],
'X86-64_ASM': ['third_party/x86inc/x86inc.asm',
'vp8/common/x86/loopfilter_block_sse2.asm',
'vp9/encoder/x86/vp9_quantize_ssse3.asm'],
'X86_ASM': [
'vp8/common/x86/dequantize_mmx.asm',
'vp8/common/x86/filter_x86.c',
'vp8/common/x86/idct_blk_mmx.c',
'vp8/common/x86/idct_blk_sse2.c',
'vp8/common/x86/idctllm_mmx.asm',
'vp8/common/x86/idctllm_sse2.asm',
'vp8/common/x86/iwalsh_mmx.asm',
'vp8/common/x86/iwalsh_sse2.asm',
'vp8/common/x86/loopfilter_mmx.asm',
'vp8/common/x86/loopfilter_sse2.asm',
'vp8/common/x86/loopfilter_x86.c',
'vp8/common/x86/mfqe_sse2.asm',
'vp8/common/x86/postproc_mmx.asm',
'vp8/common/x86/postproc_sse2.asm',
'vp8/common/x86/postproc_x86.c',
'vp8/common/x86/recon_mmx.asm',
'vp8/common/x86/recon_sse2.asm',
'vp8/common/x86/recon_wrapper_sse2.c',
'vp8/common/x86/sad_mmx.asm',
'vp8/common/x86/sad_sse2.asm',
'vp8/common/x86/sad_sse3.asm',
'vp8/common/x86/sad_sse4.asm',
'vp8/common/x86/sad_ssse3.asm',
'vp8/common/x86/subpixel_mmx.asm',
'vp8/common/x86/subpixel_sse2.asm',
'vp8/common/x86/subpixel_ssse3.asm',
'vp8/common/x86/variance_impl_mmx.asm',
'vp8/common/x86/variance_impl_sse2.asm',
'vp8/common/x86/variance_impl_ssse3.asm',
'vp8/common/x86/variance_mmx.c',
'vp8/common/x86/variance_sse2.c',
'vp8/common/x86/variance_ssse3.c',
'vp8/common/x86/vp8_asm_stubs.c',
'vp8/encoder/x86/dct_mmx.asm',
'vp8/encoder/x86/dct_sse2.asm',
'vp8/encoder/x86/denoising_sse2.c',
'vp8/encoder/x86/encodeopt.asm',
'vp8/encoder/x86/fwalsh_sse2.asm',
'vp8/encoder/x86/quantize_mmx.asm',
'vp8/encoder/x86/quantize_sse2.c',
'vp8/encoder/x86/quantize_sse4.asm',
'vp8/encoder/x86/quantize_ssse3.asm',
'vp8/encoder/x86/subtract_mmx.asm',
'vp8/encoder/x86/subtract_sse2.asm',
'vp8/encoder/x86/temporal_filter_apply_sse2.asm',
'vp8/encoder/x86/vp8_enc_stubs_mmx.c',
'vp8/encoder/x86/vp8_enc_stubs_sse2.c',
'vp9/common/x86/vp9_asm_stubs.c',
'vp9/common/x86/vp9_copy_sse2.asm',
'vp9/common/x86/vp9_idct_intrin_sse2.c',
'vp9/common/x86/vp9_intrapred_sse2.asm',
'vp9/common/x86/vp9_intrapred_ssse3.asm',
'vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
'vp9/common/x86/vp9_loopfilter_mmx.asm',
'vp9/common/x86/vp9_subpixel_8t_sse2.asm',
'vp9/common/x86/vp9_subpixel_8t_ssse3.asm',
'vp9/encoder/x86/vp9_dct32x32_sse2.c',
'vp9/encoder/x86/vp9_dct_sse2.c',
'vp9/encoder/x86/vp9_error_sse2.asm',
'vp9/encoder/x86/vp9_sad4d_sse2.asm',
'vp9/encoder/x86/vp9_sad_mmx.asm',
'vp9/encoder/x86/vp9_sad_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse3.asm',
'vp9/encoder/x86/vp9_sad_sse4.asm',
'vp9/encoder/x86/vp9_sad_ssse3.asm',
'vp9/encoder/x86/vp9_subpel_variance.asm',
'vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm',
'vp9/encoder/x86/vp9_subtract_sse2.asm',
'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
'vp9/encoder/x86/vp9_variance_impl_mmx.asm',
'vp9/encoder/x86/vp9_variance_impl_sse2.asm',
'vp9/encoder/x86/vp9_variance_mmx.c',
'vp9/encoder/x86/vp9_variance_sse2.c',
'vpx_ports/emms.asm',
'vpx_ports/x86_cpuid.c',
]
'vp8/common/x86/loopfilter_block_sse2_x86_64.asm',
'vp8/encoder/x86/ssim_opt_x86_64.asm',
'vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm',
'vp9/encoder/x86/vp9_ssim_opt_x86_64.asm'],
'X86_ASM': ['vp8/common/x86/dequantize_mmx.asm',
'vp8/common/x86/filter_x86.c',
'vp8/common/x86/idct_blk_mmx.c',
'vp8/common/x86/idct_blk_sse2.c',
'vp8/common/x86/idctllm_mmx.asm',
'vp8/common/x86/idctllm_sse2.asm',
'vp8/common/x86/iwalsh_mmx.asm',
'vp8/common/x86/iwalsh_sse2.asm',
'vp8/common/x86/loopfilter_mmx.asm',
'vp8/common/x86/loopfilter_sse2.asm',
'vp8/common/x86/loopfilter_x86.c',
'vp8/common/x86/mfqe_sse2.asm',
'vp8/common/x86/postproc_mmx.asm',
'vp8/common/x86/postproc_sse2.asm',
'vp8/common/x86/recon_mmx.asm',
'vp8/common/x86/recon_sse2.asm',
'vp8/common/x86/recon_wrapper_sse2.c',
'vp8/common/x86/sad_mmx.asm',
'vp8/common/x86/sad_sse2.asm',
'vp8/common/x86/sad_sse3.asm',
'vp8/common/x86/sad_sse4.asm',
'vp8/common/x86/sad_ssse3.asm',
'vp8/common/x86/subpixel_mmx.asm',
'vp8/common/x86/subpixel_sse2.asm',
'vp8/common/x86/subpixel_ssse3.asm',
'vp8/common/x86/variance_impl_mmx.asm',
'vp8/common/x86/variance_impl_sse2.asm',
'vp8/common/x86/variance_impl_ssse3.asm',
'vp8/common/x86/variance_mmx.c',
'vp8/common/x86/variance_sse2.c',
'vp8/common/x86/variance_ssse3.c',
'vp8/common/x86/vp8_asm_stubs.c',
'vp8/encoder/x86/dct_mmx.asm',
'vp8/encoder/x86/dct_sse2.asm',
'vp8/encoder/x86/denoising_sse2.c',
'vp8/encoder/x86/encodeopt.asm',
'vp8/encoder/x86/fwalsh_sse2.asm',
'vp8/encoder/x86/quantize_mmx.asm',
'vp8/encoder/x86/quantize_sse2.c',
'vp8/encoder/x86/quantize_sse4.c',
'vp8/encoder/x86/quantize_ssse3.c',
'vp8/encoder/x86/subtract_mmx.asm',
'vp8/encoder/x86/subtract_sse2.asm',
'vp8/encoder/x86/temporal_filter_apply_sse2.asm',
'vp8/encoder/x86/vp8_enc_stubs_mmx.c',
'vp8/encoder/x86/vp8_enc_stubs_sse2.c',
'vp9/common/x86/vp9_asm_stubs.c',
'vp9/common/x86/vp9_copy_sse2.asm',
'vp9/common/x86/vp9_idct_intrin_sse2.c',
'vp9/common/x86/vp9_idct_intrin_ssse3.c',
'vp9/common/x86/vp9_idct_ssse3_x86_64.asm',
'vp9/common/x86/vp9_intrapred_sse2.asm',
'vp9/common/x86/vp9_intrapred_ssse3.asm',
'vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
'vp9/common/x86/vp9_loopfilter_mmx.asm',
'vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c',
'vp9/common/x86/vp9_subpixel_8t_sse2.asm',
'vp9/common/x86/vp9_subpixel_8t_ssse3.asm',
'vp9/common/x86/vp9_subpixel_bilinear_sse2.asm',
'vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm',
'vp9/encoder/x86/vp9_dct32x32_sse2.c',
'vp9/encoder/x86/vp9_dct_mmx.asm',
'vp9/encoder/x86/vp9_dct_sse2.c',
'vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm',
'vp9/encoder/x86/vp9_error_sse2.asm',
'vp9/encoder/x86/vp9_sad4d_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse3.asm',
'vp9/encoder/x86/vp9_sad_sse4.asm',
'vp9/encoder/x86/vp9_sad_ssse3.asm',
'vp9/encoder/x86/vp9_subpel_variance.asm',
'vp9/encoder/x86/vp9_subtract_sse2.asm',
'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
'vp9/encoder/x86/vp9_variance_sse2.c',
'vpx_ports/emms.asm']
}


@ -234,10 +234,10 @@ ALIGNMODE k7
%define r%1mp %2
%elif ARCH_X86_64 ; memory
%define r%1m [rsp + stack_offset + %6]
%define r%1mp qword r %+ %1m
%define r%1mp qword r %+ %1 %+ m
%else
%define r%1m [esp + stack_offset + %6]
%define r%1mp dword r %+ %1m
%define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
%endmacro
@ -395,6 +395,23 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%assign n_arg_names %0
%endmacro
%if ARCH_X86_64
%macro ALLOC_STACK 2 ; stack_size, num_regs
%assign %%stack_aligment ((mmsize + 15) & ~15)
%assign stack_size_padded %1
%assign %%reg_num (%2 - 1)
%xdefine rsp_tmp r %+ %%reg_num
mov rsp_tmp, rsp
sub rsp, stack_size_padded
and rsp, ~(%%stack_aligment - 1)
%endmacro
%macro RESTORE_STACK 0 ; reset rsp register
mov rsp, rsp_tmp
%endmacro
%endif
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx, ecx, cx, cl
@ -592,16 +609,20 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
CAT_XDEFINE cglobaled_, %1, 1
%endif
%xdefine current_function %1
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf32
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
global %1:private_extern
%elifidn __OUTPUT_FORMAT__,macho64
global %1:private_extern
%ifdef CHROMIUM
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf32
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
global %1:private_extern
%elifidn __OUTPUT_FORMAT__,macho64
global %1:private_extern
%else
global %1
%endif
%else
global %1
%endif


@ -9,15 +9,23 @@
*/
#ifndef __INC_ALLOCCOMMON_H
#define __INC_ALLOCCOMMON_H
#ifndef VP8_COMMON_ALLOCCOMMON_H_
#define VP8_COMMON_ALLOCCOMMON_H_
#include "onyxc_int.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_create_common(VP8_COMMON *oci);
void vp8_remove_common(VP8_COMMON *oci);
void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
void vp8_setup_version(VP8_COMMON *oci);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ALLOCCOMMON_H_


@ -53,7 +53,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -77,7 +77,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -101,7 +101,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -127,7 +127,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords


@ -51,7 +51,7 @@ loop
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; substract negative differences from sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
@ -77,7 +77,7 @@ loop
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; substract negative differences from sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords


@ -58,7 +58,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -89,7 +89,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -120,7 +120,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -153,7 +153,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords


@ -69,7 +69,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -111,7 +111,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -153,7 +153,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -195,7 +195,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords


@ -59,7 +59,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -90,7 +90,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -121,7 +121,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -154,7 +154,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords


@ -9,8 +9,12 @@
*/
#ifndef BILINEARFILTER_ARM_H
#define BILINEARFILTER_ARM_H
#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
#ifdef __cplusplus
extern "C" {
#endif
extern void vp8_filter_block2d_bil_first_pass_armv6
(
@ -32,4 +36,8 @@ extern void vp8_filter_block2d_bil_second_pass_armv6
const short *vp8_filter
);
#endif /* BILINEARFILTER_ARM_H */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_


@ -12,26 +12,9 @@
#include "vpx_config.h"
#include "vp8/common/blockd.h"
#if HAVE_NEON
extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
#endif
#if HAVE_MEDIA
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
#endif
#if HAVE_NEON
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
vp8_dequantize_b_loop_neon(Q, DQC, DQ);
}
#endif
#if HAVE_MEDIA
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;


@ -34,11 +34,11 @@ typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif


@ -1,357 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_bilinear_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, bifilter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
add r2, r12, r2, lsl #3 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {d31}, [r2] ;load first_pass filter
beq firstpass_bfilter16x16_only
sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1
mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (17x16)
filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1
bne filt_blk2d_fp16x16_loop_neon
;First-pass filtering for rest 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1
vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1
vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1
vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q3, d14, d1
vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1
vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q4, d15, d1
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [lr]! ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter
filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
subs r12, r12, #1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r4], r5 ;store result
vst1.u8 {d4, d5}, [r4], r5
vst1.u8 {d6, d7}, [r4], r5
vmov q11, q15
vst1.u8 {d8, d9}, [r4], r5
bne filt_blk2d_sp16x16_loop_neon
add sp, sp, #272
pop {r4-r5,pc}
;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r4], r5 ;store result
vqrshrn.u16 d21, q14, #7
vst1.u8 {d16, d17}, [r4], r5
vst1.u8 {d18, d19}, [r4], r5
vst1.u8 {d20, d21}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r4], r5 ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r4], r5
vmov q11, q15
vst1.u8 {d6, d7}, [r4], r5
vst1.u8 {d8, d9}, [r4], r5
bne filt_blk2d_spo16x16_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
bifilter16_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END


@ -1,130 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict4x4_neon| PROC
push {r4, lr}
adr r12, bifilter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (5x4)
vld1.u8 {d2}, [r0], r1 ;load src data
add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)
vld1.u8 {d3}, [r0], r1
vld1.u32 {d31}, [r2] ;first_pass filter
vld1.u8 {d4}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
vld1.u8 {d5}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {d6}, [r0], r1
vshr.u64 q4, q1, #8 ;construct src_ptr[1]
vshr.u64 q5, q2, #8
vshr.u64 d12, d6, #8
vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d4, d5
vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q8, d10, d1
vmlal.u8 q9, d12, d1
vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d29, q8, #7
vqrshrn.u16 d30, q9, #7
;Second pass: 4x4
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3 ;calculate Vfilter location
vld1.u32 {d31}, [r3] ;load second_pass filter
vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d31[4]
vmull.u8 q1, d28, d0
vmull.u8 q2, d29, d0
vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
vext.8 d27, d29, d30, #4
vmlal.u8 q1, d26, d1
vmlal.u8 q2, d27, d1
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vst1.32 {d2[0]}, [r4] ;store result
vst1.32 {d2[1]}, [r0]
vst1.32 {d3[0]}, [r1]
vst1.32 {d3[1]}, [r2]
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.32 {d28[0]}, [r0], r1 ;load src data
vld1.32 {d28[1]}, [r0], r1
vld1.32 {d29[0]}, [r0], r1
vld1.32 {d29[1]}, [r0], r1
vld1.32 {d30[0]}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.32 {d28[0]}, [r4], lr ;store result
vst1.32 {d28[1]}, [r4], lr
vst1.32 {d29[0]}, [r4], lr
vst1.32 {d29[1]}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
bifilter4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END


@ -1,135 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict8x4_neon| PROC
push {r4, lr}
adr r12, bifilter8x4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (5x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {q5}, [r0], r1
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d23, q7, #7
vqrshrn.u16 d24, q8, #7
vqrshrn.u16 d25, q9, #7
vqrshrn.u16 d26, q10, #7
;Second pass: 4x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3
add r0, r4, lr
vld1.u32 {d31}, [r3] ;load second_pass filter
add r1, r0, lr
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
add r2, r1, lr
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vst1.u8 {d2}, [r4] ;store result
vst1.u8 {d3}, [r0]
vst1.u8 {d4}, [r1]
vst1.u8 {d5}, [r2]
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.u8 {d22}, [r4], lr ;store result
vst1.u8 {d23}, [r4], lr
vst1.u8 {d24}, [r4], lr
vst1.u8 {d25}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
bifilter8x4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END


@ -1,183 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict8x8_neon| PROC
push {r4, lr}
adr r12, bifilter8_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (9x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vld1.u8 {q1}, [r0], r1 ;load src data
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vld1.u8 {q2}, [r0], r1
vqrshrn.u16 d23, q7, #7
vld1.u8 {q3}, [r0], r1
vqrshrn.u16 d24, q8, #7
vld1.u8 {q4}, [r0], r1
vqrshrn.u16 d25, q9, #7
;first_pass filtering on the rest 5-line data
vld1.u8 {q5}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d27, q7, #7
vqrshrn.u16 d28, q8, #7
vqrshrn.u16 d29, q9, #7
vqrshrn.u16 d30, q10, #7
;Second pass: 8x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3
add r0, r4, lr
vld1.u32 {d31}, [r3] ;load second_pass filter
add r1, r0, lr
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
vmlal.u8 q5, d27, d1
vmlal.u8 q6, d28, d1
vmlal.u8 q7, d29, d1
vmlal.u8 q8, d30, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2}, [r4] ;store result
vst1.u8 {d3}, [r0]
vst1.u8 {d4}, [r1], lr
vst1.u8 {d5}, [r1], lr
vst1.u8 {d6}, [r1], lr
vst1.u8 {d7}, [r1], lr
vst1.u8 {d8}, [r1], lr
vst1.u8 {d9}, [r1], lr
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vld1.u8 {d27}, [r0], r1
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.u8 {d22}, [r4], lr ;store result
vst1.u8 {d23}, [r4], lr
vst1.u8 {d24}, [r4], lr
vst1.u8 {d25}, [r4], lr
vst1.u8 {d26}, [r4], lr
vst1.u8 {d27}, [r4], lr
vst1.u8 {d28}, [r4], lr
vst1.u8 {d29}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
bifilter8_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View File

@ -0,0 +1,699 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
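// Bilinear filter weights for the eight subpel positions; each pair sums to
// 128, i.e. 7-bit fixed-point fractions of the two neighbouring pixels.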
static const uint8_t bifilter4_coeff[8][2] = {
{128, 0},
{112, 16},
{ 96, 32},
{ 80, 48},
{ 64, 64},
{ 48, 80},
{ 32, 96},
{ 16, 112}
};
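// Each filtered pixel below is a widening multiply-accumulate followed by a
// rounding narrow, i.e. out = (a * coeff[0] + b * coeff[1] + 64) >> 7.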
void vp8_bilinear_predict4x4_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
uint8x16_t q1u8, q2u8;
uint16x8_t q1u16, q2u16;
uint16x8_t q7u16, q8u16, q9u16;
uint64x2_t q4u64, q5u64;
uint64x1_t d12u64;
uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
if (xoffset == 0) { // skip_1stpass_filter
uint32x2_t d28u32 = vdup_n_u32(0);
uint32x2_t d29u32 = vdup_n_u32(0);
uint32x2_t d30u32 = vdup_n_u32(0);
d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
src_ptr += src_pixels_per_line;
d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
src_ptr += src_pixels_per_line;
d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
src_ptr += src_pixels_per_line;
d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
src_ptr += src_pixels_per_line;
d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
d28u8 = vreinterpret_u8_u32(d28u32);
d29u8 = vreinterpret_u8_u32(d29u32);
d30u8 = vreinterpret_u8_u32(d30u32);
} else {
d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d6u8 = vld1_u8(src_ptr);
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
vreinterpret_u32_u8(vget_high_u8(q1u8)));
d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
vreinterpret_u32_u8(vget_high_u8(q2u8)));
d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
vreinterpret_u32_u64(vget_high_u64(q4u64)));
d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
vreinterpret_u32_u64(vget_high_u64(q5u64)));
q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
q9u16 = vmull_u8(d6u8, d0u8);
q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
d28u8 = vqrshrn_n_u16(q7u16, 7);
d29u8 = vqrshrn_n_u16(q8u16, 7);
d30u8 = vqrshrn_n_u16(q9u16, 7);
}
// secondpass_filter
if (yoffset == 0) { // skip_2ndpass_filter
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
dst_ptr += dst_pitch;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
dst_ptr += dst_pitch;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
dst_ptr += dst_pitch;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
} else {
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q1u16 = vmull_u8(d28u8, d0u8);
q2u16 = vmull_u8(d29u8, d0u8);
d26u8 = vext_u8(d28u8, d29u8, 4);
d27u8 = vext_u8(d29u8, d30u8, 4);
q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
dst_ptr += dst_pitch;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
dst_ptr += dst_pitch;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
dst_ptr += dst_pitch;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
}
return;
}
void vp8_bilinear_predict8x4_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
uint16x8_t q1u16, q2u16, q3u16, q4u16;
uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
if (xoffset == 0) { // skip_1stpass_filter
d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d26u8 = vld1_u8(src_ptr);
} else {
q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
d22u8 = vqrshrn_n_u16(q6u16, 7);
d23u8 = vqrshrn_n_u16(q7u16, 7);
d24u8 = vqrshrn_n_u16(q8u16, 7);
d25u8 = vqrshrn_n_u16(q9u16, 7);
d26u8 = vqrshrn_n_u16(q10u16, 7);
}
// secondpass_filter
if (yoffset == 0) { // skip_2ndpass_filter
vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d25u8);
} else {
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q1u16 = vmull_u8(d22u8, d0u8);
q2u16 = vmull_u8(d23u8, d0u8);
q3u16 = vmull_u8(d24u8, d0u8);
q4u16 = vmull_u8(d25u8, d0u8);
q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d5u8);
}
return;
}
void vp8_bilinear_predict8x8_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;
if (xoffset == 0) { // skip_1stpass_filter
d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
d30u8 = vld1_u8(src_ptr);
} else {
q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
d22u8 = vqrshrn_n_u16(q6u16, 7);
d23u8 = vqrshrn_n_u16(q7u16, 7);
d24u8 = vqrshrn_n_u16(q8u16, 7);
d25u8 = vqrshrn_n_u16(q9u16, 7);
// first_pass filtering on the remaining 5 lines of data
q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
q10u16 = vmlal_u8(q10u16, d11u8, d1u8);
d26u8 = vqrshrn_n_u16(q6u16, 7);
d27u8 = vqrshrn_n_u16(q7u16, 7);
d28u8 = vqrshrn_n_u16(q8u16, 7);
d29u8 = vqrshrn_n_u16(q9u16, 7);
d30u8 = vqrshrn_n_u16(q10u16, 7);
}
// secondpass_filter
if (yoffset == 0) { // skip_2ndpass_filter
vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d29u8);
} else {
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q1u16 = vmull_u8(d22u8, d0u8);
q2u16 = vmull_u8(d23u8, d0u8);
q3u16 = vmull_u8(d24u8, d0u8);
q4u16 = vmull_u8(d25u8, d0u8);
q5u16 = vmull_u8(d26u8, d0u8);
q6u16 = vmull_u8(d27u8, d0u8);
q7u16 = vmull_u8(d28u8, d0u8);
q8u16 = vmull_u8(d29u8, d0u8);
q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
q8u16 = vmlal_u8(q8u16, d30u8, d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
d6u8 = vqrshrn_n_u16(q5u16, 7);
d7u8 = vqrshrn_n_u16(q6u16, 7);
d8u8 = vqrshrn_n_u16(q7u16, 7);
d9u8 = vqrshrn_n_u16(q8u16, 7);
vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
vst1_u8((uint8_t *)dst_ptr, d9u8);
}
return;
}
void vp8_bilinear_predict16x16_neon(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch) {
int i;
unsigned char tmp[272];
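// Intermediate first-pass buffer: 17 rows x 16 pixels = 272 bytes.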
unsigned char *tmpp;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
uint8x8_t d19u8, d20u8, d21u8;
uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
if (xoffset == 0) { // secondpass_bfilter16x16_only
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
q11u8 = vld1q_u8(src_ptr);
src_ptr += src_pixels_per_line;
for (i = 4; i > 0; i--) {
q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
d6u8 = vqrshrn_n_u16(q5u16, 7);
d7u8 = vqrshrn_n_u16(q6u16, 7);
d8u8 = vqrshrn_n_u16(q7u16, 7);
d9u8 = vqrshrn_n_u16(q8u16, 7);
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
q3u8 = vcombine_u8(d6u8, d7u8);
q4u8 = vcombine_u8(d8u8, d9u8);
q11u8 = q15u8;
vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
}
return;
}
if (yoffset == 0) { // firstpass_bfilter16x16_only
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
for (i = 4; i > 0 ; i--) {
d2u8 = vld1_u8(src_ptr);
d3u8 = vld1_u8(src_ptr + 8);
d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr);
d6u8 = vld1_u8(src_ptr + 8);
d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d8u8 = vld1_u8(src_ptr);
d9u8 = vld1_u8(src_ptr + 8);
d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d11u8 = vld1_u8(src_ptr);
d12u8 = vld1_u8(src_ptr + 8);
d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
q7u16 = vmull_u8(d2u8, d0u8);
q8u16 = vmull_u8(d3u8, d0u8);
q9u16 = vmull_u8(d5u8, d0u8);
q10u16 = vmull_u8(d6u8, d0u8);
q11u16 = vmull_u8(d8u8, d0u8);
q12u16 = vmull_u8(d9u8, d0u8);
q13u16 = vmull_u8(d11u8, d0u8);
q14u16 = vmull_u8(d12u8, d0u8);
d2u8 = vext_u8(d2u8, d3u8, 1);
d5u8 = vext_u8(d5u8, d6u8, 1);
d8u8 = vext_u8(d8u8, d9u8, 1);
d11u8 = vext_u8(d11u8, d12u8, 1);
q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
d3u8 = vext_u8(d3u8, d4u8, 1);
d6u8 = vext_u8(d6u8, d7u8, 1);
d9u8 = vext_u8(d9u8, d10u8, 1);
d12u8 = vext_u8(d12u8, d13u8, 1);
q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
d14u8 = vqrshrn_n_u16(q7u16, 7);
d15u8 = vqrshrn_n_u16(q8u16, 7);
d16u8 = vqrshrn_n_u16(q9u16, 7);
d17u8 = vqrshrn_n_u16(q10u16, 7);
d18u8 = vqrshrn_n_u16(q11u16, 7);
d19u8 = vqrshrn_n_u16(q12u16, 7);
d20u8 = vqrshrn_n_u16(q13u16, 7);
d21u8 = vqrshrn_n_u16(q14u16, 7);
q7u8 = vcombine_u8(d14u8, d15u8);
q8u8 = vcombine_u8(d16u8, d17u8);
q9u8 = vcombine_u8(d18u8, d19u8);
q10u8 = vcombine_u8(d20u8, d21u8);
vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
}
return;
}
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
d2u8 = vld1_u8(src_ptr);
d3u8 = vld1_u8(src_ptr + 8);
d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr);
d6u8 = vld1_u8(src_ptr + 8);
d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d8u8 = vld1_u8(src_ptr);
d9u8 = vld1_u8(src_ptr + 8);
d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d11u8 = vld1_u8(src_ptr);
d12u8 = vld1_u8(src_ptr + 8);
d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
// First Pass: output_height lines x output_width columns (17x16)
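// 17 rows are produced because each second-pass output row blends tmp row i
// with tmp row i + 1.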
tmpp = tmp;
for (i = 3; i > 0; i--) {
q7u16 = vmull_u8(d2u8, d0u8);
q8u16 = vmull_u8(d3u8, d0u8);
q9u16 = vmull_u8(d5u8, d0u8);
q10u16 = vmull_u8(d6u8, d0u8);
q11u16 = vmull_u8(d8u8, d0u8);
q12u16 = vmull_u8(d9u8, d0u8);
q13u16 = vmull_u8(d11u8, d0u8);
q14u16 = vmull_u8(d12u8, d0u8);
d2u8 = vext_u8(d2u8, d3u8, 1);
d5u8 = vext_u8(d5u8, d6u8, 1);
d8u8 = vext_u8(d8u8, d9u8, 1);
d11u8 = vext_u8(d11u8, d12u8, 1);
q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
d3u8 = vext_u8(d3u8, d4u8, 1);
d6u8 = vext_u8(d6u8, d7u8, 1);
d9u8 = vext_u8(d9u8, d10u8, 1);
d12u8 = vext_u8(d12u8, d13u8, 1);
q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
d14u8 = vqrshrn_n_u16(q7u16, 7);
d15u8 = vqrshrn_n_u16(q8u16, 7);
d16u8 = vqrshrn_n_u16(q9u16, 7);
d17u8 = vqrshrn_n_u16(q10u16, 7);
d18u8 = vqrshrn_n_u16(q11u16, 7);
d19u8 = vqrshrn_n_u16(q12u16, 7);
d20u8 = vqrshrn_n_u16(q13u16, 7);
d21u8 = vqrshrn_n_u16(q14u16, 7);
d2u8 = vld1_u8(src_ptr);
d3u8 = vld1_u8(src_ptr + 8);
d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d5u8 = vld1_u8(src_ptr);
d6u8 = vld1_u8(src_ptr + 8);
d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d8u8 = vld1_u8(src_ptr);
d9u8 = vld1_u8(src_ptr + 8);
d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
d11u8 = vld1_u8(src_ptr);
d12u8 = vld1_u8(src_ptr + 8);
d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
q7u8 = vcombine_u8(d14u8, d15u8);
q8u8 = vcombine_u8(d16u8, d17u8);
q9u8 = vcombine_u8(d18u8, d19u8);
q10u8 = vcombine_u8(d20u8, d21u8);
vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
}
// First-pass filtering for the remaining 5 lines
d14u8 = vld1_u8(src_ptr);
d15u8 = vld1_u8(src_ptr + 8);
d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
q9u16 = vmull_u8(d2u8, d0u8);
q10u16 = vmull_u8(d3u8, d0u8);
q11u16 = vmull_u8(d5u8, d0u8);
q12u16 = vmull_u8(d6u8, d0u8);
q13u16 = vmull_u8(d8u8, d0u8);
q14u16 = vmull_u8(d9u8, d0u8);
d2u8 = vext_u8(d2u8, d3u8, 1);
d5u8 = vext_u8(d5u8, d6u8, 1);
d8u8 = vext_u8(d8u8, d9u8, 1);
q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
d3u8 = vext_u8(d3u8, d4u8, 1);
d6u8 = vext_u8(d6u8, d7u8, 1);
d9u8 = vext_u8(d9u8, d10u8, 1);
q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
q1u16 = vmull_u8(d11u8, d0u8);
q2u16 = vmull_u8(d12u8, d0u8);
q3u16 = vmull_u8(d14u8, d0u8);
q4u16 = vmull_u8(d15u8, d0u8);
d11u8 = vext_u8(d11u8, d12u8, 1);
d14u8 = vext_u8(d14u8, d15u8, 1);
q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
d12u8 = vext_u8(d12u8, d13u8, 1);
d15u8 = vext_u8(d15u8, d16u8, 1);
q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
d10u8 = vqrshrn_n_u16(q9u16, 7);
d11u8 = vqrshrn_n_u16(q10u16, 7);
d12u8 = vqrshrn_n_u16(q11u16, 7);
d13u8 = vqrshrn_n_u16(q12u16, 7);
d14u8 = vqrshrn_n_u16(q13u16, 7);
d15u8 = vqrshrn_n_u16(q14u16, 7);
d16u8 = vqrshrn_n_u16(q1u16, 7);
d17u8 = vqrshrn_n_u16(q2u16, 7);
d18u8 = vqrshrn_n_u16(q3u16, 7);
d19u8 = vqrshrn_n_u16(q4u16, 7);
q5u8 = vcombine_u8(d10u8, d11u8);
q6u8 = vcombine_u8(d12u8, d13u8);
q7u8 = vcombine_u8(d14u8, d15u8);
q8u8 = vcombine_u8(d16u8, d17u8);
q9u8 = vcombine_u8(d18u8, d19u8);
vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
vst1q_u8((uint8_t *)tmpp, q9u8);
// secondpass_filter
d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
tmpp = tmp;
q11u8 = vld1q_u8(tmpp);
tmpp += 16;
for (i = 4; i > 0; i--) {
q12u8 = vld1q_u8(tmpp); tmpp += 16;
q13u8 = vld1q_u8(tmpp); tmpp += 16;
q14u8 = vld1q_u8(tmpp); tmpp += 16;
q15u8 = vld1q_u8(tmpp); tmpp += 16;
q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
d2u8 = vqrshrn_n_u16(q1u16, 7);
d3u8 = vqrshrn_n_u16(q2u16, 7);
d4u8 = vqrshrn_n_u16(q3u16, 7);
d5u8 = vqrshrn_n_u16(q4u16, 7);
d6u8 = vqrshrn_n_u16(q5u16, 7);
d7u8 = vqrshrn_n_u16(q6u16, 7);
d8u8 = vqrshrn_n_u16(q7u16, 7);
d9u8 = vqrshrn_n_u16(q8u16, 7);
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
q3u8 = vcombine_u8(d6u8, d7u8);
q4u8 = vcombine_u8(d8u8, d9u8);
q11u8 = q15u8;
vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
}
return;
}

View File

@ -1,584 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_build_intra_predictors_mby_neon_func|
EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_neon_func| PROC
push {r4-r8, lr}
cmp r3, #0
beq case_dc_pred
cmp r3, #1
beq case_v_pred
cmp r3, #2
beq case_h_pred
cmp r3, #3
beq case_tm_pred
case_dc_pred
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up
cmp r5, #0
beq skip_dc_pred_left
sub r0, r0, #1
; Load left row, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_v_pred
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_h_pred
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
pop {r4-r8,pc}
case_tm_pred
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
subs r12, r12, #1
bne case_tm_pred_loop
pop {r4-r8,pc}
ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_s_neon_func| PROC
push {r4-r8, lr}
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
cmp r3, #0
beq case_dc_pred_s
cmp r3, #1
beq case_v_pred_s
cmp r3, #2
beq case_h_pred_s
cmp r3, #3
beq case_tm_pred_s
case_dc_pred_s
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left_s
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up_s
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up_s
cmp r5, #0
beq skip_dc_pred_left_s
sub r0, r0, #1
; Load left row, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left_s
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_v_pred_s
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_h_pred_s
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
pop {r4-r8,pc}
case_tm_pred_s
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop_s
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
subs r12, r12, #1
bne case_tm_pred_loop_s
pop {r4-r8,pc}
ENDP
END

View File

@ -1,59 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem16x16_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem16x16_neon| PROC
vld1.u8 {q0}, [r0], r1
vld1.u8 {q1}, [r0], r1
vld1.u8 {q2}, [r0], r1
vst1.u8 {q0}, [r2], r3
vld1.u8 {q3}, [r0], r1
vst1.u8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vst1.u8 {q2}, [r2], r3
vld1.u8 {q5}, [r0], r1
vst1.u8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vst1.u8 {q4}, [r2], r3
vld1.u8 {q7}, [r0], r1
vst1.u8 {q5}, [r2], r3
vld1.u8 {q8}, [r0], r1
vst1.u8 {q6}, [r2], r3
vld1.u8 {q9}, [r0], r1
vst1.u8 {q7}, [r2], r3
vld1.u8 {q10}, [r0], r1
vst1.u8 {q8}, [r2], r3
vld1.u8 {q11}, [r0], r1
vst1.u8 {q9}, [r2], r3
vld1.u8 {q12}, [r0], r1
vst1.u8 {q10}, [r2], r3
vld1.u8 {q13}, [r0], r1
vst1.u8 {q11}, [r2], r3
vld1.u8 {q14}, [r0], r1
vst1.u8 {q12}, [r2], r3
vld1.u8 {q15}, [r0], r1
vst1.u8 {q13}, [r2], r3
vst1.u8 {q14}, [r2], r3
vst1.u8 {q15}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem16x16_neon|
END

View File

@ -1,34 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x4_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x4_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vst1.u8 {d3}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem8x4_neon|
END

View File

@ -1,43 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x8_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x8_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vld1.u8 {d4}, [r0], r1
vst1.u8 {d3}, [r2], r3
vld1.u8 {d5}, [r0], r1
vst1.u8 {d4}, [r2], r3
vld1.u8 {d6}, [r0], r1
vst1.u8 {d5}, [r2], r3
vld1.u8 {d7}, [r0], r1
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem8x8_neon|
END

View File

@ -0,0 +1,59 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_copy_mem8x4_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
uint8x8_t vtmp;
int r;
for (r = 0; r < 4; r++) {
vtmp = vld1_u8(src);
vst1_u8(dst, vtmp);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem8x8_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
uint8x8_t vtmp;
int r;
for (r = 0; r < 8; r++) {
vtmp = vld1_u8(src);
vst1_u8(dst, vtmp);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem16x16_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
int r;
uint8x16_t qtmp;
for (r = 0; r < 16; r++) {
qtmp = vld1q_u8(src);
vst1q_u8(dst, qtmp);
src += src_stride;
dst += dst_stride;
}
}

View File

@ -1,54 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
; int pred_stride, unsigned char *dst_ptr,
; int dst_stride)
; r0 input_dc
; r1 pred_ptr
; r2 pred_stride
; r3 dst_ptr
; sp dst_stride
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
vld1.32 {d2[0]}, [r1], r2
vld1.32 {d2[1]}, [r1], r2
vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
vaddw.u8 q2, q0, d4
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r3], r12
vst1.32 {d2[1]}, [r3], r12
vst1.32 {d4[0]}, [r3], r12
vst1.32 {d4[1]}, [r3]
bx lr
ENDP
END

View File

@ -0,0 +1,42 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
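// Adds the rounded DC term, (input_dc + 4) >> 3, to every pixel of a 4x4
// prediction block and stores the saturated result.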
void vp8_dc_only_idct_add_neon(
int16_t input_dc,
unsigned char *pred_ptr,
int pred_stride,
unsigned char *dst_ptr,
int dst_stride) {
int i;
uint16_t a1 = ((input_dc + 4) >> 3);
uint32x2_t d2u32 = vdup_n_u32(0);
uint8x8_t d2u8;
uint16x8_t q1u16;
uint16x8_t qAdd;
qAdd = vdupq_n_u16(a1);
for (i = 0; i < 2; i++) {
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
pred_ptr += pred_stride;
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
pred_ptr += pred_stride;
q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
dst_ptr += dst_stride;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
dst_ptr += dst_stride;
}
}

View File

@ -1,131 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_add_neon(short *input, short *dq,
; unsigned char *dest, int stride)
; r0 short *input,
; r1 short *dq,
; r2 unsigned char *dest
; r3 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
add r1, r2, r3 ; r1 = dest + stride
lsl r3, #1 ; 2x stride
vld1.32 {d14[0]}, [r2], r3
vld1.32 {d14[1]}, [r1], r3
vld1.32 {d15[0]}, [r2]
vld1.32 {d15[1]}, [r1]
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; memset(input, 0, 32) -- 32 bytes
vmov.i16 q14, #0
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vmov q15, q14
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vst1.16 {q14, q15}, [r0]
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
sub r2, r2, r3
sub r1, r1, r3
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r2], r3
vst1.32 {d0[1]}, [r1], r3
vst1.32 {d1[0]}, [r2]
vst1.32 {d1[1]}, [r1]
bx lr
ENDP ; |vp8_dequant_idct_add_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -0,0 +1,142 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
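// Q16 IDCT multipliers: 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and
// 35468/65536 ~= sqrt(2)*sin(pi/8).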
void vp8_dequant_idct_add_neon(
int16_t *input,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int32x2_t d14, d15;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
int16x8_t q1, q2, q3, q4, q5, q6;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x2x2_t d2tmp0, d2tmp1;
int16x4x2_t d2tmp2, d2tmp3;
d14 = d15 = vdup_n_s32(0);
// load input
q3 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
input += 8;
q4 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
// load dq
q5 = vld1q_s16(dq);
dq += 8;
q6 = vld1q_s16(dq);
// load src from dst
dst0 = dst;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
dst0 += stride;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
vreinterpretq_u16_s16(q5)));
q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
vreinterpretq_u16_s16(q6)));
d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
// loop 2
q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
vreinterpret_u8_s32(d14)));
q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
vreinterpret_u8_s32(d15)));
d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d14, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d14, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 1);
return;
}

View File

@ -1,34 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequantize_b_loop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 short *Q,
; r1 short *DQC
; r2 short *DQ
|vp8_dequantize_b_loop_neon| PROC
vld1.16 {q0, q1}, [r0]
vld1.16 {q2, q3}, [r1]
vmul.i16 q4, q0, q2
vmul.i16 q5, q1, q3
vst1.16 {q4, q5}, [r2]
bx lr
ENDP
END

View File

@ -0,0 +1,25 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/common/blockd.h"
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
int16x8x2_t qQ, qDQC, qDQ;
qQ = vld2q_s16(d->qcoeff);
qDQC = vld2q_s16(DQC);
qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
vst2q_s16(d->dqcoeff, qDQ);
}

View File

@ -1,79 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_0_2x_neon(short *q, short dq,
; unsigned char *dst, int stride);
; r0 *q
; r1 dq
; r2 *dst
; r3 stride
|idct_dequant_0_2x_neon| PROC
push {r4, r5}
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d8[0]}, [r12], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d8[1]}, [r12], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d10[0]}, [r12], r3
vld1.32 {d4[1]}, [r2], r3
vld1.32 {d10[1]}, [r12], r3
ldrh r12, [r0] ; lo q
ldrh r4, [r0, #32] ; hi q
mov r5, #0
strh r5, [r0]
strh r5, [r0, #32]
sxth r12, r12 ; lo
mul r0, r12, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q0, r0
sxth r4, r4 ; hi
mul r0, r4, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
vaddw.u8 q1, q0, d2 ; lo
vaddw.u8 q2, q0, d4
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r0, r2, #4
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
pop {r4, r5}
bx lr
ENDP ; |idct_dequant_0_2x_neon|
END

View File

@ -0,0 +1,62 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
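// Handles two horizontally adjacent 4x4 blocks (dst and dst + 4) whose only
// nonzero coefficient is the DC: each pixel gets (dc * dq + 4) >> 3 added.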
void idct_dequant_0_2x_neon(
int16_t *q,
int16_t dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int i, a0, a1;
int16x8x2_t q2Add;
int32x2_t d2s32, d4s32;
uint8x8_t d2u8, d4u8;
uint16x8_t q1u16, q2u16;
a0 = ((q[0] * dq) + 4) >> 3;
a1 = ((q[16] * dq) + 4) >> 3;
q[0] = q[16] = 0;
q2Add.val[0] = vdupq_n_s16((int16_t)a0);
q2Add.val[1] = vdupq_n_s16((int16_t)a1);
for (i = 0; i < 2; i++, dst += 4) {
dst0 = dst;
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
dst0 += stride;
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
dst0 += stride;
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
dst0 += stride;
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
vreinterpret_u8_s32(d2s32));
q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
vreinterpret_u8_s32(d4s32));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
d2s32 = vreinterpret_s32_u8(d2u8);
d4s32 = vreinterpret_s32_u8(d4u8);
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d2s32, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d2s32, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d4s32, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d4s32, 1);
}
return;
}

View File

@ -1,196 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_full_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride);
; r0 *q,
; r1 *dq,
; r2 *dst
; r3 stride
|idct_dequant_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r3 ; l pre
vld1.32 {d28[1]}, [r12], r3 ; r pre
vld1.32 {d29[0]}, [r2], r3
vld1.32 {d29[1]}, [r12], r3
vld1.32 {d30[0]}, [r2], r3
vld1.32 {d30[1]}, [r12], r3
vld1.32 {d31[0]}, [r2], r3
vld1.32 {d31[1]}, [r12]
adr r1, cospi8sqrt2minus1 ; pointer to the first constant
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
vmul.i16 q3, q3, q1
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
vld1.16 {d0}, [r1]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
vswp d5, d8
vswp d7, d10
; _CONSTANTS_ * 4,12 >> 16
; q6: 4 * sinpi : c1/temp1
; q7: 12 * sinpi : d1/temp2
; q8: 4 * cospi
; q9: 12 * cospi
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q7, q5, d0[2]
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q9, q5, d0[0]
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
; vqdmulh only accepts signed values. this was a problem because
; our constant had the high bit set, and was treated as a negative value.
; vqdmulh also doubles the value before it shifts by 16. we need to
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
; so we can shift the constant without losing precision. this avoids
; shift again afterward, but also avoids the sign issue. win win!
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
; pre-shift it
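; in other words: vqdmulh computes (2 * a * b) >> 16, so multiplying by the
; pre-shifted 0x4546 (= 0x8a8c >> 1) already yields (a * 0x8a8c) >> 16, while
; the unshifted 0x4e7b result still carries the doubling and is halved by the
; vshr #1 below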
vshr.s16 q8, q8, #1
vshr.s16 q9, q9, #1
; q4: 4 + 4 * cospi : d1/temp1
; q5: 12 + 12 * cospi : c1/temp2
vqadd.s16 q4, q4, q8
vqadd.s16 q5, q5, q9
; c1 = temp1 - temp2
; d1 = temp1 + temp2
vqsub.s16 q2, q6, q5
vqadd.s16 q3, q4, q7
; [0]: a1+d1
; [1]: b1+c1
; [2]: b1-c1
; [3]: a1-d1
vqadd.s16 q4, q10, q3
vqadd.s16 q5, q11, q2
vqsub.s16 q6, q11, q2
vqsub.s16 q7, q10, q3
; rotate
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; idct loop 2
; q4: l 0, 4, 8,12 r 0, 4, 8,12
; q5: l 1, 5, 9,13 r 1, 5, 9,13
; q6: l 2, 6,10,14 r 2, 6,10,14
; q7: l 3, 7,11,15 r 3, 7,11,15
; q8: 1 * sinpi : c1/temp1
; q9: 3 * sinpi : d1/temp2
; q10: 1 * cospi
; q11: 3 * cospi
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q9, q7, d0[2]
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q11, q7, d0[0]
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
; see note on shifting above
vshr.s16 q10, q10, #1
vshr.s16 q11, q11, #1
; q10: 1 + 1 * cospi : d1/temp1
; q11: 3 + 3 * cospi : c1/temp2
vqadd.s16 q10, q5, q10
vqadd.s16 q11, q7, q11
; q8: c1 = temp1 - temp2
; q9: d1 = temp1 + temp2
vqsub.s16 q8, q8, q11
vqadd.s16 q9, q10, q9
; a1+d1
; b1+c1
; b1-c1
; a1-d1
vqadd.s16 q4, q2, q9
vqadd.s16 q5, q3, q8
vqsub.s16 q6, q3, q8
vqsub.s16 q7, q2, q9
; +4 >> 3 (rounding)
vrshr.s16 q4, q4, #3 ; lo
vrshr.s16 q5, q5, #3
vrshr.s16 q6, q6, #3 ; hi
vrshr.s16 q7, q7, #3
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; adding pre
; input is still packed. pre was read interleaved
vaddw.u8 q4, q4, d28
vaddw.u8 q5, q5, d29
vaddw.u8 q6, q6, d30
vaddw.u8 q7, q7, d31
vmov.i16 q14, #0
vmov q15, q14
vst1.16 {q14, q15}, [r0] ; write over high input
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r1, r2, #4 ; hi
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
vst1.32 {d0[0]}, [r2], r3 ; lo
vst1.32 {d0[1]}, [r1], r3 ; hi
vst1.32 {d1[0]}, [r2], r3
vst1.32 {d1[1]}, [r1], r3
vst1.32 {d2[0]}, [r2], r3
vst1.32 {d2[1]}, [r1], r3
vst1.32 {d3[0]}, [r2]
vst1.32 {d3[1]}, [r1]
bx lr
ENDP ; |idct_dequant_full_2x_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2 DCD 0x4546
END

View File

@ -0,0 +1,185 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 17734;
// because the lowest bit in 0x8a8c is 0, we can pre-shift this
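// i.e. 17734 == 35468 >> 1; vqdmulhq_n_s16() returns (2 * a * b) >> 16, so the
// implicit doubling restores the full (a * 35468) >> 16 product with no
// further shift.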
void idct_dequant_full_2x_neon(
int16_t *q,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0, *dst1;
int32x2_t d28, d29, d30, d31;
int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x4x2_t q2tmp0, q2tmp1;
int16x8x2_t q2tmp2, q2tmp3;
int16x4_t dLow0, dLow1, dHigh0, dHigh1;
d28 = d29 = d30 = d31 = vdup_n_s32(0);
// load dq
q0 = vld1q_s16(dq);
dq += 8;
q1 = vld1q_s16(dq);
// load q
q2 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q3 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q4 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q5 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
// load src from dst
dst0 = dst;
dst1 = dst + 4;
d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
dst0 += stride;
d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
dst1 += stride;
d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
dst0 += stride;
d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
dst1 += stride;
d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
dst0 += stride;
d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
dst1 += stride;
d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
q2 = vmulq_s16(q2, q0);
q3 = vmulq_s16(q3, q1);
q4 = vmulq_s16(q4, q0);
q5 = vmulq_s16(q5, q1);
// vswp
dLow0 = vget_low_s16(q2);
dHigh0 = vget_high_s16(q2);
dLow1 = vget_low_s16(q4);
dHigh1 = vget_high_s16(q4);
q2 = vcombine_s16(dLow0, dLow1);
q4 = vcombine_s16(dHigh0, dHigh1);
dLow0 = vget_low_s16(q3);
dHigh0 = vget_high_s16(q3);
dLow1 = vget_low_s16(q5);
dHigh1 = vget_high_s16(q5);
q3 = vcombine_s16(dLow0, dLow1);
q5 = vcombine_s16(dHigh0, dHigh1);
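// loop 1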
q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
q10 = vqaddq_s16(q2, q3);
q11 = vqsubq_s16(q2, q3);
q8 = vshrq_n_s16(q8, 1);
q9 = vshrq_n_s16(q9, 1);
q4 = vqaddq_s16(q4, q8);
q5 = vqaddq_s16(q5, q9);
q2 = vqsubq_s16(q6, q5);
q3 = vqaddq_s16(q7, q4);
q4 = vqaddq_s16(q10, q3);
q5 = vqaddq_s16(q11, q2);
q6 = vqsubq_s16(q11, q2);
q7 = vqsubq_s16(q10, q3);
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
vreinterpretq_s16_s32(q2tmp1.val[0]));
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
vreinterpretq_s16_s32(q2tmp1.val[1]));
// loop 2
q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
q10 = vshrq_n_s16(q10, 1);
q11 = vshrq_n_s16(q11, 1);
q10 = vqaddq_s16(q2tmp2.val[1], q10);
q11 = vqaddq_s16(q2tmp3.val[1], q11);
q8 = vqsubq_s16(q8, q11);
q9 = vqaddq_s16(q9, q10);
q4 = vqaddq_s16(q2, q9);
q5 = vqaddq_s16(q3, q8);
q6 = vqsubq_s16(q3, q8);
q7 = vqsubq_s16(q2, q9);
q4 = vrshrq_n_s16(q4, 3);
q5 = vrshrq_n_s16(q5, 3);
q6 = vrshrq_n_s16(q6, 3);
q7 = vrshrq_n_s16(q7, 3);
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
vreinterpretq_s16_s32(q2tmp1.val[0]));
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
vreinterpretq_s16_s32(q2tmp1.val[1]));
q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
vreinterpret_u8_s32(d28)));
q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
vreinterpret_u8_s32(d29)));
q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
vreinterpret_u8_s32(d30)));
q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
vreinterpret_u8_s32(d31)));
d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
dst0 = dst;
dst1 = dst + 4;
vst1_lane_s32((int32_t *)dst0, d28, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d28, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d29, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d29, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d30, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d30, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d31, 0);
vst1_lane_s32((int32_t *)dst1, d31, 1);
return;
}

View File

@@ -1,87 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
vld1.i16 {q0-q1}, [r0@128]
; first for loop
vadd.s16 d4, d0, d3 ;a = [0] + [12]
vadd.s16 d6, d1, d2 ;b = [4] + [8]
vsub.s16 d5, d0, d3 ;d = [0] - [12]
vsub.s16 d7, d1, d2 ;c = [4] - [8]
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vtrn.32 d0, d2 ;d0: 0 1 8 9
;d2: 2 3 10 11
vtrn.32 d1, d3 ;d1: 4 5 12 13
;d3: 6 7 14 15
vtrn.16 d0, d1 ;d0: 0 4 8 12
;d1: 1 5 9 13
vtrn.16 d2, d3 ;d2: 2 6 10 14
;d3: 3 7 11 15
; second for loop
vadd.s16 d4, d0, d3 ;a = [0] + [3]
vadd.s16 d6, d1, d2 ;b = [1] + [2]
vsub.s16 d5, d0, d3 ;d = [0] - [3]
vsub.s16 d7, d1, d2 ;c = [1] - [2]
vmov.i16 q8, #3
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vadd.i16 q0, q0, q8 ;e/f += 3
vadd.i16 q1, q1, q8 ;g/h += 3
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
mov r2, #64
add r3, r1, #32
vst1.i16 d0[0], [r1],r2
vst1.i16 d1[0], [r3],r2
vst1.i16 d2[0], [r1],r2
vst1.i16 d3[0], [r3],r2
vst1.i16 d0[1], [r1],r2
vst1.i16 d1[1], [r3],r2
vst1.i16 d2[1], [r1],r2
vst1.i16 d3[1], [r3],r2
vst1.i16 d0[2], [r1],r2
vst1.i16 d1[2], [r3],r2
vst1.i16 d2[2], [r1],r2
vst1.i16 d3[2], [r3],r2
vst1.i16 d0[3], [r1],r2
vst1.i16 d1[3], [r3],r2
vst1.i16 d2[3], [r1]
vst1.i16 d3[3], [r3]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_neon|
END

View File

@@ -0,0 +1,102 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_short_inv_walsh4x4_neon(
int16_t *input,
int16_t *mb_dqcoeff) {
int16x8_t q0s16, q1s16, q2s16, q3s16;
int16x4_t d4s16, d5s16, d6s16, d7s16;
int16x4x2_t v2tmp0, v2tmp1;
int32x2x2_t v2tmp2, v2tmp3;
int16x8_t qAdd3;
q0s16 = vld1q_s16(input);
q1s16 = vld1q_s16(input + 8);
// 1st for loop
d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
q0s16 = vaddq_s16(q2s16, q3s16);
q1s16 = vsubq_s16(q2s16, q3s16);
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
vreinterpret_s32_s16(vget_low_s16(q1s16)));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
vreinterpret_s32_s16(vget_high_s16(q1s16)));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
vreinterpret_s16_s32(v2tmp3.val[0]));
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
vreinterpret_s16_s32(v2tmp3.val[1]));
// 2nd for loop
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
qAdd3 = vdupq_n_s16(3);
q0s16 = vaddq_s16(q2s16, q3s16);
q1s16 = vsubq_s16(q2s16, q3s16);
q0s16 = vaddq_s16(q0s16, qAdd3);
q1s16 = vaddq_s16(q1s16, qAdd3);
q0s16 = vshrq_n_s16(q0s16, 3);
q1s16 = vshrq_n_s16(q1s16, 3);
// store
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
mb_dqcoeff += 16;
return;
}

View File

@@ -1,397 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1
add r1, r1, r1
vdup.u8 q2, r3 ; duplicate thresh
vld1.u8 {q3}, [r2@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r2@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r2@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r2@128] ; q2
vld1.u8 {q10}, [r12@128] ; q3
sub r2, r2, r1, lsl #1
sub r12, r12, r1, lsl #1
bl vp8_loop_filter_neon
vst1.u8 {q5}, [r2@128], r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
ldr r12, [sp, #4] ; load thresh
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r3@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r3@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r3@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r3@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r3@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r3@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r3@64] ; q3
vld1.u8 {d21}, [r12@64] ; q3
bl vp8_loop_filter_neon
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r2@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r2@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh)
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_loop_filter_vertical_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1, asr #1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r2], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r2], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r2], r1
vld1.u8 {d20}, [r12], r1
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
vld1.u8 {d9}, [r12], r1
vld1.u8 {d11}, [r2], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d15}, [r2], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d19}, [r2]
vld1.u8 {d21}, [r12]
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r3 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp8_loop_filter_neon
vswp d12, d11
vswp d16, d13
sub r0, r0, #2 ; dst ptr
vswp d14, d12
vswp d16, d15
add r12, r0, r1, asr #1
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
sub r12, r0, #4 ; move u pointer down by 4 columns
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r12 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp8_loop_filter_neon
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
sub r0, r0, #2
sub r2, r2, #2
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
; void vp8_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store.
; r0-r3 PRESERVE
; q0 flimit
; q1 limit
; q2 thresh
; q3 p3
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3
|vp8_loop_filter_neon| PROC
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
vmov.u8 q10, #0x80 ; 0x80
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
vcge.u8 q15, q1, q15
; vp8_filter() function
; convert to signed
veor q7, q7, q10 ; qs0
vshr.u8 q2, q2, #1 ; a = a / 2
veor q6, q6, q10 ; ps0
veor q5, q5, q10 ; ps1
vqadd.u8 q9, q9, q2 ; a = b + a
veor q8, q8, q10 ; qs1
vmov.u8 q10, #3 ; #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vorr q14, q13, q14 ; vp8_hevmask
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
vmul.i16 q11, q11, q4
vand q1, q1, q14 ; vp8_filter &= hev
vand q15, q15, q9 ; vp8_filter_mask
vaddw.s8 q2, q2, d2
vaddw.s8 q11, q11, d3
vmov.u8 q9, #4 ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q11
vand q1, q1, q15 ; vp8_filter &= mask
vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
; outer tap adjustments: ++vp8_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp8_filter &= ~hev
vmov.u8 q0, #0x80 ; 0x80
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
veor q5, q13, q0 ; *op1 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr
ENDP ; |vp8_loop_filter_neon|
;-----------------
END

View File

@@ -0,0 +1,549 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_loop_filter_neon(
uint8x16_t qblimit, // flimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r) { // q1
uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q2s16, q11s16;
uint16x8_t q4u16;
int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
int8x8_t d2s8, d3s8;
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q3 = vabdq_u8(q9, q8);
q4 = vabdq_u8(q10, q9);
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q3 = vmaxq_u8(q3, q4);
q15u8 = vmaxq_u8(q11u8, q12u8);
q9 = vabdq_u8(q6, q7);
// vp8_hevmask
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q3);
q2u8 = vabdq_u8(q5, q8);
q9 = vqaddq_u8(q9, q9);
q15u8 = vcgeq_u8(qlimit, q15u8);
// vp8_filter() function
// convert to signed
q10 = vdupq_n_u8(0x80);
q8 = veorq_u8(q8, q10);
q7 = veorq_u8(q7, q10);
q6 = veorq_u8(q6, q10);
q5 = veorq_u8(q5, q10);
q2u8 = vshrq_n_u8(q2u8, 1);
q9 = vqaddq_u8(q9, q2u8);
q10 = vdupq_n_u8(3);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q9 = vcgeq_u8(qblimit, q9);
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
vreinterpretq_s8_u8(q8));
q14u8 = vorrq_u8(q13u8, q14u8);
q4u16 = vmovl_u8(vget_low_u8(q10));
q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
q15u8 = vandq_u8(q15u8, q9);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
q9 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2s8 = vqmovn_s16(q2s16);
d3s8 = vqmovn_s16(q11s16);
q1s8 = vcombine_s8(d2s8, d3s8);
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
q2s8 = vshrq_n_s8(q2s8, 3);
q1s8 = vshrq_n_s8(q1s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
q1s8 = vrshrq_n_s8(q1s8, 1);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
q0u8 = vdupq_n_u8(0x80);
*q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
*q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
*q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
*q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
return;
}
void vp8_loop_filter_horizontal_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
src -= (pitch << 2);
q3 = vld1q_u8(src);
src += pitch;
q4 = vld1q_u8(src);
src += pitch;
q5 = vld1q_u8(src);
src += pitch;
q6 = vld1q_u8(src);
src += pitch;
q7 = vld1q_u8(src);
src += pitch;
q8 = vld1q_u8(src);
src += pitch;
q9 = vld1q_u8(src);
src += pitch;
q10 = vld1q_u8(src);
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
src -= (pitch * 5);
vst1q_u8(src, q5);
src += pitch;
vst1q_u8(src, q6);
src += pitch;
vst1q_u8(src, q7);
src += pitch;
vst1q_u8(src, q8);
return;
}
void vp8_loop_filter_horizontal_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
u -= (pitch << 2);
v -= (pitch << 2);
d6 = vld1_u8(u);
u += pitch;
d7 = vld1_u8(v);
v += pitch;
d8 = vld1_u8(u);
u += pitch;
d9 = vld1_u8(v);
v += pitch;
d10 = vld1_u8(u);
u += pitch;
d11 = vld1_u8(v);
v += pitch;
d12 = vld1_u8(u);
u += pitch;
d13 = vld1_u8(v);
v += pitch;
d14 = vld1_u8(u);
u += pitch;
d15 = vld1_u8(v);
v += pitch;
d16 = vld1_u8(u);
u += pitch;
d17 = vld1_u8(v);
v += pitch;
d18 = vld1_u8(u);
u += pitch;
d19 = vld1_u8(v);
v += pitch;
d20 = vld1_u8(u);
d21 = vld1_u8(v);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
u -= (pitch * 5);
vst1_u8(u, vget_low_u8(q5));
u += pitch;
vst1_u8(u, vget_low_u8(q6));
u += pitch;
vst1_u8(u, vget_low_u8(q7));
u += pitch;
vst1_u8(u, vget_low_u8(q8));
v -= (pitch * 5);
vst1_u8(v, vget_high_u8(q5));
v += pitch;
vst1_u8(v, vget_high_u8(q6));
v += pitch;
vst1_u8(v, vget_high_u8(q7));
v += pitch;
vst1_u8(v, vget_high_u8(q8));
return;
}
static INLINE void write_4x8(unsigned char *dst, int pitch,
const uint8x8x4_t result) {
#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
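/* vst4_lane_u8 on a full uint8x8x4_t is assumed to be usable only with
   gcc 4.7 and newer; older compilers take the manual transpose path below. */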
vst4_lane_u8(dst, result, 0);
dst += pitch;
vst4_lane_u8(dst, result, 1);
dst += pitch;
vst4_lane_u8(dst, result, 2);
dst += pitch;
vst4_lane_u8(dst, result, 3);
dst += pitch;
vst4_lane_u8(dst, result, 4);
dst += pitch;
vst4_lane_u8(dst, result, 5);
dst += pitch;
vst4_lane_u8(dst, result, 6);
dst += pitch;
vst4_lane_u8(dst, result, 7);
#else
/*
* uint8x8x4_t result
00 01 02 03 | 04 05 06 07
10 11 12 13 | 14 15 16 17
20 21 22 23 | 24 25 26 27
30 31 32 33 | 34 35 36 37
---
* after vtrn_u16
00 01 20 21 | 04 05 24 25
02 03 22 23 | 06 07 26 27
10 11 30 31 | 14 15 34 35
12 13 32 33 | 16 17 36 37
---
* after vtrn_u8
00 10 20 30 | 04 14 24 34
01 11 21 31 | 05 15 25 35
02 12 22 32 | 06 16 26 36
03 13 23 33 | 07 17 27 37
*/
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
vreinterpret_u16_u8(result.val[2]));
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
vreinterpret_u16_u8(result.val[3]));
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
vreinterpret_u8_u16(r13_u16.val[0]));
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
vreinterpret_u8_u16(r13_u16.val[1]));
const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#endif
}
void vp8_loop_filter_vertical_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
unsigned char *s, *d;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
uint8x8x4_t q4ResultH, q4ResultL;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
s = src - 4;
d6 = vld1_u8(s);
s += pitch;
d8 = vld1_u8(s);
s += pitch;
d10 = vld1_u8(s);
s += pitch;
d12 = vld1_u8(s);
s += pitch;
d14 = vld1_u8(s);
s += pitch;
d16 = vld1_u8(s);
s += pitch;
d18 = vld1_u8(s);
s += pitch;
d20 = vld1_u8(s);
s += pitch;
d7 = vld1_u8(s);
s += pitch;
d9 = vld1_u8(s);
s += pitch;
d11 = vld1_u8(s);
s += pitch;
d13 = vld1_u8(s);
s += pitch;
d15 = vld1_u8(s);
s += pitch;
d17 = vld1_u8(s);
s += pitch;
d19 = vld1_u8(s);
s += pitch;
d21 = vld1_u8(s);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
q4ResultL.val[0] = vget_low_u8(q5); // d10
q4ResultL.val[1] = vget_low_u8(q6); // d12
q4ResultL.val[2] = vget_low_u8(q7); // d14
q4ResultL.val[3] = vget_low_u8(q8); // d16
q4ResultH.val[0] = vget_high_u8(q5); // d11
q4ResultH.val[1] = vget_high_u8(q6); // d13
q4ResultH.val[2] = vget_high_u8(q7); // d15
q4ResultH.val[3] = vget_high_u8(q8); // d17
d = src - 2;
write_4x8(d, pitch, q4ResultL);
d += pitch * 8;
write_4x8(d, pitch, q4ResultH);
}
void vp8_loop_filter_vertical_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
unsigned char *us, *ud;
unsigned char *vs, *vd;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
uint8x8x4_t q4ResultH, q4ResultL;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
us = u - 4;
d6 = vld1_u8(us);
us += pitch;
d8 = vld1_u8(us);
us += pitch;
d10 = vld1_u8(us);
us += pitch;
d12 = vld1_u8(us);
us += pitch;
d14 = vld1_u8(us);
us += pitch;
d16 = vld1_u8(us);
us += pitch;
d18 = vld1_u8(us);
us += pitch;
d20 = vld1_u8(us);
vs = v - 4;
d7 = vld1_u8(vs);
vs += pitch;
d9 = vld1_u8(vs);
vs += pitch;
d11 = vld1_u8(vs);
vs += pitch;
d13 = vld1_u8(vs);
vs += pitch;
d15 = vld1_u8(vs);
vs += pitch;
d17 = vld1_u8(vs);
vs += pitch;
d19 = vld1_u8(vs);
vs += pitch;
d21 = vld1_u8(vs);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
q4ResultL.val[0] = vget_low_u8(q5); // d10
q4ResultL.val[1] = vget_low_u8(q6); // d12
q4ResultL.val[2] = vget_low_u8(q7); // d14
q4ResultL.val[3] = vget_low_u8(q8); // d16
ud = u - 2;
write_4x8(ud, pitch, q4ResultL);
q4ResultH.val[0] = vget_high_u8(q5); // d11
q4ResultH.val[1] = vget_high_u8(q6); // d13
q4ResultH.val[2] = vget_high_u8(q7); // d15
q4ResultH.val[3] = vget_high_u8(q8); // d17
vd = v - 2;
write_4x8(vd, pitch, q4ResultH);
}

View File

@@ -1,117 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
EXPORT |vp8_loop_filter_bhs_neon|
EXPORT |vp8_loop_filter_mbhs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q5}, [r3@128], r1 ; p0
vld1.u8 {q8}, [r0@128] ; q1
vld1.u8 {q6}, [r3@128] ; p1
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
vmul.s16 q3, q3, q13
vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
vand q14, q4, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
sub r0, r0, r1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bhs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate blim
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
pop {r4, lr}
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbhs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_mbhs_neon|
END

View File

@@ -0,0 +1,111 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
uint8_t *sp;
uint8x16_t qblimit, q0u8;
uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
int16x8_t q2s16, q3s16, q13s16;
int8x8_t d8s8, d9s8;
int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
qblimit = vdupq_n_u8(*blimit);
sp = s - (p << 1);
q5u8 = vld1q_u8(sp);
sp += p;
q6u8 = vld1q_u8(sp);
sp += p;
q7u8 = vld1q_u8(sp);
sp += p;
q8u8 = vld1q_u8(sp);
q15u8 = vabdq_u8(q6u8, q7u8);
q14u8 = vabdq_u8(q5u8, q8u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q13s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q7u8 = veorq_u8(q7u8, q0u8);
q8u8 = veorq_u8(q8u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
vget_low_s8(vreinterpretq_s8_u8(q6u8)));
q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
vget_high_s8(vreinterpretq_s8_u8(q6u8)));
q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
vreinterpretq_s8_u8(q8u8));
q2s16 = vmulq_s16(q2s16, q13s16);
q3s16 = vmulq_s16(q3s16, q13s16);
q10u8 = vdupq_n_u8(3);
q9u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
d8s8 = vqmovn_s16(q2s16);
d9s8 = vqmovn_s16(q3s16);
q4s8 = vcombine_s8(d8s8, d9s8);
q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q3s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
vst1q_u8(s, q7u8);
s -= p;
vst1q_u8(s, q6u8);
return;
}
void vp8_loop_filter_bhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}

View File

@@ -1,154 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
EXPORT |vp8_loop_filter_bvs_neon|
EXPORT |vp8_loop_filter_mbvs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
vswp d7, d10
vswp d12, d9
;vp8_filter_mask() function
;vp8_hevmask() function
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q11, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vmul.s16 q13, q13, q11
vmov.u8 q11, #0x03 ; 0x03
vmov.u8 q12, #0x04 ; 0x04
vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d29
vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d29, q13
add r0, r0, #1
add r3, r0, r1
vand q14, q14, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
;calculate output
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
add r12, r1, r1
vswp d13, d14
;store op1, op0, oq0, oq1
vst2.8 {d12[0], d13[0]}, [r0], r12
vst2.8 {d12[1], d13[1]}, [r3], r12
vst2.8 {d12[2], d13[2]}, [r0], r12
vst2.8 {d12[3], d13[3]}, [r3], r12
vst2.8 {d12[4], d13[4]}, [r0], r12
vst2.8 {d12[5], d13[5]}, [r3], r12
vst2.8 {d12[6], d13[6]}, [r0], r12
vst2.8 {d12[7], d13[7]}, [r3], r12
vst2.8 {d14[0], d15[0]}, [r0], r12
vst2.8 {d14[1], d15[1]}, [r3], r12
vst2.8 {d14[2], d15[2]}, [r0], r12
vst2.8 {d14[3], d15[3]}, [r3], r12
vst2.8 {d14[4], d15[4]}, [r0], r12
vst2.8 {d14[5], d15[5]}, [r3], r12
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bvs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
mov r4, r0
add r0, r0, #4
vdup.s8 q1, r3 ; duplicate blim
bl vp8_loop_filter_simple_vertical_edge_neon
; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
add r0, r4, #8
bl vp8_loop_filter_simple_vertical_edge_neon
add r0, r4, #12
pop {r4, lr}
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_bvs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbvs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_mbvs_neon|
END

View File

@@ -0,0 +1,279 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
static INLINE void write_2x8(unsigned char *dst, int pitch,
const uint8x8x2_t result,
const uint8x8x2_t result2) {
vst2_lane_u8(dst, result, 0);
dst += pitch;
vst2_lane_u8(dst, result, 1);
dst += pitch;
vst2_lane_u8(dst, result, 2);
dst += pitch;
vst2_lane_u8(dst, result, 3);
dst += pitch;
vst2_lane_u8(dst, result, 4);
dst += pitch;
vst2_lane_u8(dst, result, 5);
dst += pitch;
vst2_lane_u8(dst, result, 6);
dst += pitch;
vst2_lane_u8(dst, result, 7);
dst += pitch;
vst2_lane_u8(dst, result2, 0);
dst += pitch;
vst2_lane_u8(dst, result2, 1);
dst += pitch;
vst2_lane_u8(dst, result2, 2);
dst += pitch;
vst2_lane_u8(dst, result2, 3);
dst += pitch;
vst2_lane_u8(dst, result2, 4);
dst += pitch;
vst2_lane_u8(dst, result2, 5);
dst += pitch;
vst2_lane_u8(dst, result2, 6);
dst += pitch;
vst2_lane_u8(dst, result2, 7);
}
#else
static INLINE void write_2x4(unsigned char *dst, int pitch,
const uint8x8x2_t result) {
/*
* uint8x8x2_t result
00 01 02 03 | 04 05 06 07
10 11 12 13 | 14 15 16 17
---
* after vtrn_u8
00 10 02 12 | 04 14 06 16
01 11 03 13 | 05 15 07 17
*/
const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
result.val[1]);
const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
}
static INLINE void write_2x8(unsigned char *dst, int pitch,
const uint8x8x2_t result,
const uint8x8x2_t result2) {
write_2x4(dst, pitch, result);
dst += pitch * 8;
write_2x4(dst, pitch, result2);
}
#endif
#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
x = vld4_lane_u8(src, x, 0);
src += pitch;
x = vld4_lane_u8(src, x, 1);
src += pitch;
x = vld4_lane_u8(src, x, 2);
src += pitch;
x = vld4_lane_u8(src, x, 3);
src += pitch;
x = vld4_lane_u8(src, x, 4);
src += pitch;
x = vld4_lane_u8(src, x, 5);
src += pitch;
x = vld4_lane_u8(src, x, 6);
src += pitch;
x = vld4_lane_u8(src, x, 7);
return x;
}
#else
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
const uint8x8_t a = vld1_u8(src);
const uint8x8_t b = vld1_u8(src + pitch * 1);
const uint8x8_t c = vld1_u8(src + pitch * 2);
const uint8x8_t d = vld1_u8(src + pitch * 3);
const uint8x8_t e = vld1_u8(src + pitch * 4);
const uint8x8_t f = vld1_u8(src + pitch * 5);
const uint8x8_t g = vld1_u8(src + pitch * 6);
const uint8x8_t h = vld1_u8(src + pitch * 7);
const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
vreinterpret_u32_u8(e));
const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
vreinterpret_u32_u8(f));
const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
vreinterpret_u32_u8(g));
const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
vreinterpret_u32_u8(h));
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
vreinterpret_u16_u32(r26_u32.val[0]));
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
vreinterpret_u16_u32(r37_u32.val[0]));
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
vreinterpret_u8_u16(r13_u16.val[0]));
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
vreinterpret_u8_u16(r13_u16.val[1]));
/*
* after vtrn_u32
00 01 02 03 | 40 41 42 43
10 11 12 13 | 50 51 52 53
20 21 22 23 | 60 61 62 63
30 31 32 33 | 70 71 72 73
---
* after vtrn_u16
00 01 20 21 | 40 41 60 61
02 03 22 23 | 42 43 62 63
10 11 30 31 | 50 51 70 71
12 13 32 33 | 52 53 72 73
00 01 20 21 | 40 41 60 61
10 11 30 31 | 50 51 70 71
02 03 22 23 | 42 43 62 63
12 13 32 33 | 52 53 72 73
---
* after vtrn_u8
00 10 20 30 | 40 50 60 70
01 11 21 31 | 41 51 61 71
02 12 22 32 | 42 52 62 72
03 13 23 33 | 43 53 63 73
*/
x.val[0] = r01_u8.val[0];
x.val[1] = r01_u8.val[1];
x.val[2] = r23_u8.val[0];
x.val[3] = r23_u8.val[1];
return x;
}
#endif
static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
unsigned char *src1;
uint8x16_t qblimit, q0u8;
uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
int16x8_t q2s16, q13s16, q11s16;
int8x8_t d28s8, d29s8;
int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
uint8x8x4_t d0u8x4; // d6, d7, d8, d9
uint8x8x4_t d1u8x4; // d10, d11, d12, d13
uint8x8x2_t d2u8x2; // d12, d13
uint8x8x2_t d3u8x2; // d14, d15
qblimit = vdupq_n_u8(*blimit);
src1 = s - 2;
d0u8x4 = read_4x8(src1, p, d0u8x4);
src1 += p * 8;
d1u8x4 = read_4x8(src1, p, d1u8x4);
q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
q15u8 = vabdq_u8(q5u8, q4u8);
q14u8 = vabdq_u8(q3u8, q6u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q11s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q3u8 = veorq_u8(q3u8, q0u8);
q4u8 = veorq_u8(q4u8, q0u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
vget_low_s8(vreinterpretq_s8_u8(q5u8)));
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
vget_high_s8(vreinterpretq_s8_u8(q5u8)));
q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
vreinterpretq_s8_u8(q6u8));
q2s16 = vmulq_s16(q2s16, q11s16);
q13s16 = vmulq_s16(q13s16, q11s16);
q11u8 = vdupq_n_u8(3);
q12u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
d28s8 = vqmovn_s16(q2s16);
d29s8 = vqmovn_s16(q13s16);
q14s8 = vcombine_s8(d28s8, d29s8);
q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q14s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
d2u8x2.val[0] = vget_low_u8(q6u8); // d12
d2u8x2.val[1] = vget_low_u8(q7u8); // d14
d3u8x2.val[0] = vget_high_u8(q6u8); // d13
d3u8x2.val[1] = vget_high_u8(q7u8); // d15
src1 = s - 1;
write_2x8(src1, p, d2u8x2, d3u8x2);
}
void vp8_loop_filter_bvs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbvs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
return;
}

View File

@@ -1,469 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
push {lr}
add r1, r1, r1 ; double stride
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
vld1.u8 {q3}, [r0@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r0@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r0@128], r1 ; q2
vld1.u8 {q10}, [r12@128], r1 ; q3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #2
add r0, r12, r1, lsr #1
vst1.u8 {q4}, [r12@128],r1 ; store op2
vst1.u8 {q5}, [r0@128],r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r0@128],r1 ; store oq0
vst1.u8 {q8}, [r12@128] ; store oq1
vst1.u8 {q9}, [r0@128] ; store oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r0@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r0@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r0@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r0@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r0@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r0@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r0@64], r1 ; q3
vld1.u8 {d21}, [r12@64], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r12, r12, r1, lsl #3
add r0, r0, r1
add r12, r12, r1
vst1.u8 {d8}, [r0@64], r1 ; store u op2
vst1.u8 {d9}, [r12@64], r1 ; store v op2
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r12@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r12@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move src pointer down by 4 columns
vdup.s8 q2, r12 ; thresh
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2    const unsigned char *blimit,
; r3    const unsigned char *limit,
; sp    const unsigned char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move u pointer down by 4 columns
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r12], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
; This is a helper function for the macroblock loopfilters. The individual
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
; r0,r1 PRESERVE
; r2 mblimit
; r3 limit
; q2 thresh
; q3 p3 PRESERVE
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3 PRESERVE
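; A typical call sequence (sketch, mirroring the edge functions above):
;   load p3..q3 into q3..q10 (transposing first for the vertical edges)
;   bl   vp8_mbloop_filter_neon
;   store the filtered p2..q2 from q4..q9 (transposing back if needed)
; r0/r1 and q3/q10 pass through unmodified so the caller can reuse them.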
|vp8_mbloop_filter_neon| PROC
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q1, q1, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
vmax.u8 q15, q15, q1
vdup.u8 q1, r3 ; limit
vdup.u8 q2, r2 ; mblimit
vmov.u8 q0, #0x80 ; 0x80
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
vmov.u16 q11, #3 ; #3
; vp8_filter
; convert to signed
veor q7, q7, q0 ; qs0
vshr.u8 q1, q1, #1 ; a = a / 2
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
vqadd.u8 q12, q12, q1 ; a = b + a
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
vorr q14, q13, q14 ; vp8_hevmask
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vand q15, q15, q12 ; vp8_filter_mask
vmul.i16 q13, q13, q11
vmov.u8 q12, #3 ; #3
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vmov.u8 q11, #4 ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q13
vand q1, q1, q15 ; vp8_filter &= mask
vmov.u16 q15, #63 ; #63
vand q13, q1, q14 ; Filter2 &= hev
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
vmov q0, q15
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
vmov q11, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp8_filter &= ~hev
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
vmov.u8 d5, #9 ; #9
vmov.u8 d4, #18 ; #18
vmov q13, q15
vmov q14, q15
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
vmlal.s8 q11, d3, d5
vmov.u8 d5, #27 ; #27
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
vmlal.s8 q13, d3, d4
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
vmlal.s8 q15, d3, d5
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d1, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
vmov.u8 q1, #0x80 ; 0x80
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
veor q9, q11, q1 ; *oq2 = s^0x80
veor q4, q0, q1 ; *op2 = s^0x80
veor q8, q13, q1 ; *oq1 = s^0x80
veor q5, q12, q1 ; *op1 = s^0x80
veor q7, q15, q1 ; *oq0 = s^0x80
veor q6, q14, q1 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|
;-----------------
END

View File

@ -0,0 +1,625 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_mbloop_filter_neon(
uint8x16_t qblimit, // mblimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
uint8x16_t *q4r, // p2
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r, // q1
uint8x16_t *q9r) { // q2
uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
int8x16_t q0s8, q12s8, q14s8, q15s8;
int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q1u8 = vabdq_u8(q9, q8);
q0u8 = vabdq_u8(q10, q9);
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q1u8 = vmaxq_u8(q1u8, q0u8);
q15u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vabdq_u8(q6, q7);
// vp8_hevmask
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q1u8);
q15u8 = vcgeq_u8(qlimit, q15u8);
q1u8 = vabdq_u8(q5, q8);
q12u8 = vqaddq_u8(q12u8, q12u8);
// vp8_filter() function
// convert to signed
q0u8 = vdupq_n_u8(0x80);
q9 = veorq_u8(q9, q0u8);
q8 = veorq_u8(q8, q0u8);
q7 = veorq_u8(q7, q0u8);
q6 = veorq_u8(q6, q0u8);
q5 = veorq_u8(q5, q0u8);
q4 = veorq_u8(q4, q0u8);
q1u8 = vshrq_n_u8(q1u8, 1);
q12u8 = vqaddq_u8(q12u8, q1u8);
q14u8 = vorrq_u8(q13u8, q14u8);
q12u8 = vcgeq_u8(qblimit, q12u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
vreinterpretq_s8_u8(q8));
q11s16 = vdupq_n_s16(3);
q2s16 = vmulq_s16(q2s16, q11s16);
q13s16 = vmulq_s16(q13s16, q11s16);
q15u8 = vandq_u8(q15u8, q12u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
q12u8 = vdupq_n_u8(3);
q11u8 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2 = vqmovn_s16(q2s16);
d3 = vqmovn_s16(q13s16);
q1s8 = vcombine_s8(d2, d3);
q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q13s8 = vshrq_n_s8(q13s8, 3);
q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
d5 = vdup_n_s8(9);
d4 = vdup_n_s8(18);
q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
d5 = vdup_n_s8(27);
q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
d0 = vqshrn_n_s16(q0s16 , 7);
d1 = vqshrn_n_s16(q11s16, 7);
d24 = vqshrn_n_s16(q12s16, 7);
d25 = vqshrn_n_s16(q13s16, 7);
d28 = vqshrn_n_s16(q14s16, 7);
d29 = vqshrn_n_s16(q15s16, 7);
q0s8 = vcombine_s8(d0, d1);
q12s8 = vcombine_s8(d24, d25);
q14s8 = vcombine_s8(d28, d29);
q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
q15s8 = vqsubq_s8((q7s8), q14s8);
q14s8 = vqaddq_s8((q6s8), q14s8);
q1u8 = vdupq_n_u8(0x80);
*q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
*q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
*q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
*q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
*q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
*q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
return;
}
void vp8_mbloop_filter_horizontal_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
src -= (pitch << 2);
q3 = vld1q_u8(src);
src += pitch;
q4 = vld1q_u8(src);
src += pitch;
q5 = vld1q_u8(src);
src += pitch;
q6 = vld1q_u8(src);
src += pitch;
q7 = vld1q_u8(src);
src += pitch;
q8 = vld1q_u8(src);
src += pitch;
q9 = vld1q_u8(src);
src += pitch;
q10 = vld1q_u8(src);
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
src -= (pitch * 6);
vst1q_u8(src, q4);
src += pitch;
vst1q_u8(src, q5);
src += pitch;
vst1q_u8(src, q6);
src += pitch;
vst1q_u8(src, q7);
src += pitch;
vst1q_u8(src, q8);
src += pitch;
vst1q_u8(src, q9);
return;
}
void vp8_mbloop_filter_horizontal_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
u -= (pitch << 2);
v -= (pitch << 2);
d6 = vld1_u8(u);
u += pitch;
d7 = vld1_u8(v);
v += pitch;
d8 = vld1_u8(u);
u += pitch;
d9 = vld1_u8(v);
v += pitch;
d10 = vld1_u8(u);
u += pitch;
d11 = vld1_u8(v);
v += pitch;
d12 = vld1_u8(u);
u += pitch;
d13 = vld1_u8(v);
v += pitch;
d14 = vld1_u8(u);
u += pitch;
d15 = vld1_u8(v);
v += pitch;
d16 = vld1_u8(u);
u += pitch;
d17 = vld1_u8(v);
v += pitch;
d18 = vld1_u8(u);
u += pitch;
d19 = vld1_u8(v);
v += pitch;
d20 = vld1_u8(u);
d21 = vld1_u8(v);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
u -= (pitch * 6);
v -= (pitch * 6);
vst1_u8(u, vget_low_u8(q4));
u += pitch;
vst1_u8(v, vget_high_u8(q4));
v += pitch;
vst1_u8(u, vget_low_u8(q5));
u += pitch;
vst1_u8(v, vget_high_u8(q5));
v += pitch;
vst1_u8(u, vget_low_u8(q6));
u += pitch;
vst1_u8(v, vget_high_u8(q6));
v += pitch;
vst1_u8(u, vget_low_u8(q7));
u += pitch;
vst1_u8(v, vget_high_u8(q7));
v += pitch;
vst1_u8(u, vget_low_u8(q8));
u += pitch;
vst1_u8(v, vget_high_u8(q8));
v += pitch;
vst1_u8(u, vget_low_u8(q9));
vst1_u8(v, vget_high_u8(q9));
return;
}
void vp8_mbloop_filter_vertical_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
unsigned char *s1, *s2;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
s1 = src - 4;
s2 = s1 + 8 * pitch;
d6 = vld1_u8(s1);
s1 += pitch;
d7 = vld1_u8(s2);
s2 += pitch;
d8 = vld1_u8(s1);
s1 += pitch;
d9 = vld1_u8(s2);
s2 += pitch;
d10 = vld1_u8(s1);
s1 += pitch;
d11 = vld1_u8(s2);
s2 += pitch;
d12 = vld1_u8(s1);
s1 += pitch;
d13 = vld1_u8(s2);
s2 += pitch;
d14 = vld1_u8(s1);
s1 += pitch;
d15 = vld1_u8(s2);
s2 += pitch;
d16 = vld1_u8(s1);
s1 += pitch;
d17 = vld1_u8(s2);
s2 += pitch;
d18 = vld1_u8(s1);
s1 += pitch;
d19 = vld1_u8(s2);
s2 += pitch;
d20 = vld1_u8(s1);
d21 = vld1_u8(s2);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
s1 -= 7 * pitch;
s2 -= 7 * pitch;
vst1_u8(s1, vget_low_u8(q3));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q3));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q4));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q4));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q5));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q5));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q6));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q6));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q7));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q7));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q8));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q8));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q9));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q9));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q10));
vst1_u8(s2, vget_high_u8(q10));
return;
}
void vp8_mbloop_filter_vertical_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
unsigned char *us, *ud;
unsigned char *vs, *vd;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
us = u - 4;
vs = v - 4;
d6 = vld1_u8(us);
us += pitch;
d7 = vld1_u8(vs);
vs += pitch;
d8 = vld1_u8(us);
us += pitch;
d9 = vld1_u8(vs);
vs += pitch;
d10 = vld1_u8(us);
us += pitch;
d11 = vld1_u8(vs);
vs += pitch;
d12 = vld1_u8(us);
us += pitch;
d13 = vld1_u8(vs);
vs += pitch;
d14 = vld1_u8(us);
us += pitch;
d15 = vld1_u8(vs);
vs += pitch;
d16 = vld1_u8(us);
us += pitch;
d17 = vld1_u8(vs);
vs += pitch;
d18 = vld1_u8(us);
us += pitch;
d19 = vld1_u8(vs);
vs += pitch;
d20 = vld1_u8(us);
d21 = vld1_u8(vs);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
ud = u - 4;
vst1_u8(ud, vget_low_u8(q3));
ud += pitch;
vst1_u8(ud, vget_low_u8(q4));
ud += pitch;
vst1_u8(ud, vget_low_u8(q5));
ud += pitch;
vst1_u8(ud, vget_low_u8(q6));
ud += pitch;
vst1_u8(ud, vget_low_u8(q7));
ud += pitch;
vst1_u8(ud, vget_low_u8(q8));
ud += pitch;
vst1_u8(ud, vget_low_u8(q9));
ud += pitch;
vst1_u8(ud, vget_low_u8(q10));
vd = v - 4;
vst1_u8(vd, vget_high_u8(q3));
vd += pitch;
vst1_u8(vd, vget_high_u8(q4));
vd += pitch;
vst1_u8(vd, vget_high_u8(q5));
vd += pitch;
vst1_u8(vd, vget_high_u8(q6));
vd += pitch;
vst1_u8(vd, vget_high_u8(q7));
vd += pitch;
vst1_u8(vd, vget_high_u8(q8));
vd += pitch;
vst1_u8(vd, vget_high_u8(q9));
vd += pitch;
vst1_u8(vd, vget_high_u8(q10));
return;
}

View File

@ -0,0 +1,210 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/common/blockd.h"
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
unsigned char * yabove_row,
unsigned char * yleft,
int left_stride,
unsigned char * ypred_ptr,
int y_stride) {
const int mode = x->mode_info_context->mbmi.mode;
int i;
switch (mode) {
case DC_PRED:
{
int shift = x->up_available + x->left_available;
uint8x16_t v_expected_dc = vdupq_n_u8(128);
if (shift) {
unsigned int average = 0;
int expected_dc;
if (x->up_available) {
const uint8x16_t v_above = vld1q_u8(yabove_row);
const uint16x8_t a = vpaddlq_u8(v_above);
const uint32x4_t b = vpaddlq_u16(a);
const uint64x2_t c = vpaddlq_u32(b);
const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
vreinterpret_u32_u64(vget_high_u64(c)));
average = vget_lane_u32(d, 0);
}
if (x->left_available) {
for (i = 0; i < 16; ++i) {
average += yleft[0];
yleft += left_stride;
}
}
shift += 3;
expected_dc = (average + (1 << (shift - 1))) >> shift;
v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
}
for (i = 0; i < 16; ++i) {
vst1q_u8(ypred_ptr, v_expected_dc);
ypred_ptr += y_stride;
}
}
break;
case V_PRED:
{
const uint8x16_t v_above = vld1q_u8(yabove_row);
for (i = 0; i < 16; ++i) {
vst1q_u8(ypred_ptr, v_above);
ypred_ptr += y_stride;
}
}
break;
case H_PRED:
{
for (i = 0; i < 16; ++i) {
const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
yleft += left_stride;
vst1q_u8(ypred_ptr, v_yleft);
ypred_ptr += y_stride;
}
}
break;
case TM_PRED:
{
const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
const uint8x16_t v_above = vld1q_u8(yabove_row);
for (i = 0; i < 16; ++i) {
const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
vreinterpretq_s16_u16(v_ytop_left));
const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
vreinterpretq_s16_u16(v_ytop_left));
const uint8x8_t pred_lo = vqmovun_s16(b_lo);
const uint8x8_t pred_hi = vqmovun_s16(b_hi);
vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
ypred_ptr += y_stride;
yleft += left_stride;
}
}
break;
}
}
void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
unsigned char * uabove_row,
unsigned char * vabove_row,
unsigned char * uleft,
unsigned char * vleft,
int left_stride,
unsigned char * upred_ptr,
unsigned char * vpred_ptr,
int pred_stride) {
const int mode = x->mode_info_context->mbmi.uv_mode;
int i;
switch (mode) {
case DC_PRED:
{
int shift = x->up_available + x->left_available;
uint8x8_t v_expected_udc = vdup_n_u8(128);
uint8x8_t v_expected_vdc = vdup_n_u8(128);
if (shift) {
unsigned int average_u = 0;
unsigned int average_v = 0;
int expected_udc;
int expected_vdc;
if (x->up_available) {
const uint8x8_t v_uabove = vld1_u8(uabove_row);
const uint8x8_t v_vabove = vld1_u8(vabove_row);
const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
const uint32x4_t b = vpaddlq_u16(a);
const uint64x2_t c = vpaddlq_u32(b);
average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
}
if (x->left_available) {
for (i = 0; i < 8; ++i) {
average_u += uleft[0];
uleft += left_stride;
average_v += vleft[0];
vleft += left_stride;
}
}
shift += 2;
expected_udc = (average_u + (1 << (shift - 1))) >> shift;
expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
}
for (i = 0; i < 8; ++i) {
vst1_u8(upred_ptr, v_expected_udc);
upred_ptr += pred_stride;
vst1_u8(vpred_ptr, v_expected_vdc);
vpred_ptr += pred_stride;
}
}
break;
case V_PRED:
{
const uint8x8_t v_uabove = vld1_u8(uabove_row);
const uint8x8_t v_vabove = vld1_u8(vabove_row);
for (i = 0; i < 8; ++i) {
vst1_u8(upred_ptr, v_uabove);
upred_ptr += pred_stride;
vst1_u8(vpred_ptr, v_vabove);
vpred_ptr += pred_stride;
}
}
break;
case H_PRED:
{
for (i = 0; i < 8; ++i) {
const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
uleft += left_stride;
vleft += left_stride;
vst1_u8(upred_ptr, v_uleft);
upred_ptr += pred_stride;
vst1_u8(vpred_ptr, v_vleft);
vpred_ptr += pred_stride;
}
}
break;
case TM_PRED:
{
const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
const uint8x8_t v_uabove = vld1_u8(uabove_row);
const uint8x8_t v_vabove = vld1_u8(vabove_row);
for (i = 0; i < 8; ++i) {
const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
vreinterpretq_s16_u16(v_utop_left));
const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
vreinterpretq_s16_u16(v_vtop_left));
const uint8x8_t pred_u = vqmovun_s16(b_u);
const uint8x8_t pred_v = vqmovun_s16(b_v);
vst1_u8(upred_ptr, pred_u);
vst1_u8(vpred_ptr, pred_v);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
uleft += left_stride;
vleft += left_stride;
}
}
break;
}
}

View File

@ -1,207 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad16x16_neon|
EXPORT |vp8_sad16x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
|vp8_sad16x16_neon| PROC
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0]
vld1.8 {q7}, [r2]
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;==============================
;unsigned int vp8_sad16x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad16x8_neon| PROC
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
END

View File

@ -1,209 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad8x8_neon|
EXPORT |vp8_sad8x16_neon|
EXPORT |vp8_sad4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; unsigned int vp8_sad8x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x8_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;============================
;unsigned int vp8_sad8x16_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x16_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;===========================
;unsigned int vp8_sad4x4_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad4x4_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 d1, d24
vpaddl.u32 d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
END

View File

@ -0,0 +1,184 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
unsigned int vp8_sad8x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 7; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad8x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 15; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad4x4_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x2_t d1;
uint64x1_t d3;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 3; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
d1 = vpaddl_u16(vget_low_u16(q12));
d3 = vpaddl_u32(d1);
return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
unsigned int vp8_sad16x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 15; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad16x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 7; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}

View File

@ -1,36 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_push_neon|
EXPORT |vp8_pop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
|vp8_push_neon| PROC
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
|vp8_pop_neon| PROC
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
END

View File

@ -1,139 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;*************************************************************
;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
; unsigned char *dst, int stride)
;r0 short * input
;r1 unsigned char *pred
;r2 int pitch
;r3 unsigned char *dst
;sp int stride
;*************************************************************
; static const int cospi8sqrt2minus1=20091;
; static const int sinpi8sqrt2 =35468;
; static const int rounding = 0;
; Optimization note: The data resulting from dequantization are signed
; 13-bit values in the range [-4096, 4095]. This allows the use of the
; "vqdmulh" (NEON) instruction, since the doubling multiply cannot go
; out of range (13+16+1=30 bits < 32 bits). The instruction returns the
; high half of the multiplication result, which is what the IDCT needs.
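; As a rough worst-case check of the claim above: |input| <= 4096 (2^12)
; and |coeff| <= 32768 (2^15), so the doubled product is at most
; 2 * 4096 * 32768 = 2^28, comfortably below the signed 32-bit limit of
; 2^31, and vqdmulh never saturates here.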
|vp8_short_idct4x4llm_neon| PROC
adr r12, idct_coeff
vld1.16 {q1, q2}, [r0]
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
ldr r0, [sp] ; stride
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
;d6 - c1:temp1
;d7 - d1:temp2
;d8 - d1:temp1
;d9 - c1:temp2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; load prediction data
vld1.32 d6[0], [r1], r2
vld1.32 d6[1], [r1], r2
vld1.32 d7[0], [r1], r2
vld1.32 d7[1], [r1], r2
; add prediction and residual
vaddw.u8 q1, q1, d6
vaddw.u8 q2, q2, d7
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
; store to destination
vst1.32 d1[0], [r3], r0
vst1.32 d1[1], [r3], r0
vst1.32 d2[0], [r3], r0
vst1.32 d2[1], [r3], r0
bx lr
ENDP
;-----------------
idct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c
;20091, 20091, 35468, 35468
END

View File

@ -0,0 +1,123 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
void vp8_short_idct4x4llm_neon(
int16_t *input,
unsigned char *pred_ptr,
int pred_stride,
unsigned char *dst_ptr,
int dst_stride) {
int i;
uint32x2_t d6u32 = vdup_n_u32(0);
uint8x8_t d1u8;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
uint16x8_t q1u16;
int16x8_t q1s16, q2s16, q3s16, q4s16;
int32x2x2_t v2tmp0, v2tmp1;
int16x4x2_t v2tmp2, v2tmp3;
d2 = vld1_s16(input);
d3 = vld1_s16(input + 4);
d4 = vld1_s16(input + 8);
d5 = vld1_s16(input + 12);
// 1st for loop
q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
q2s16 = vcombine_s16(d3, d5);
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
vreinterpret_s16_s32(v2tmp1.val[0]));
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
vreinterpret_s16_s32(v2tmp1.val[1]));
// 2nd for loop
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
vreinterpret_s16_s32(v2tmp1.val[0]));
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
vreinterpret_s16_s32(v2tmp1.val[1]));
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
// dc_only_idct_add
for (i = 0; i < 2; i++, q1s16 = q2s16) {
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
pred_ptr += pred_stride;
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
pred_ptr += pred_stride;
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
vreinterpret_u8_u32(d6u32));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
dst_ptr += dst_stride;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
dst_ptr += dst_stride;
}
return;
}

View File

@ -1,490 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter16_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
;Note: To take advantage of the 8-bit multiplication instruction in NEON, first apply abs()
; to the filter coeffs to make them u8, then use vmlsl for the negative coeffs. After
; multiplication the result can be negative, so it is treated as s16. However, the result
; can also be a large positive number (> 2^15-1), which would be misread as a negative
; number. To avoid that error, apply the filter coeffs in the order 0, 1, 4, 5, 2, which
; keeps the running sum within the s16 range. Finally, saturating-add the result of
; applying the 3rd filter coeff. The same applies to the other filter functions.
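; A rough worked illustration of the ordering argument, using the offset-4 taps
; (3, -16, 77, 77, -16, 3) and all six source pixels at 255: accumulating taps
; 0, 1, 4, 5, 2 gives running sums 765, -3315, -7395, -6630, 13005, all within
; s16; the separate tap-3 product is 19635, and the final vqadd yields 32640.
; Summing the two 77 taps back to back (19635 + 19635 = 39270) would already
; overflow s16, which is why tap 3 is applied last with a saturating add.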
|vp8_sixtap_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, filter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter16x16_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter16x16_only
sub sp, sp, #336 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #7 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (21x16)
filt_blk2d_fp16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d7, d0
vmull.u8 q10, d9, d0
vmull.u8 q11, d10, d0
vmull.u8 q12, d12, d0
vmull.u8 q13, d13, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d9, d10, #1
vext.8 d30, d12, d13, #1
vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q10, d29, d1
vmlsl.u8 q12, d30, d1
vext.8 d28, d7, d8, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d13, d14, #1
vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q11, d29, d1
vmlsl.u8 q13, d30, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d9, d10, #4
vext.8 d30, d12, d13, #4
vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q10, d29, d4
vmlsl.u8 q12, d30, d4
vext.8 d28, d7, d8, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d13, d14, #4
vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q11, d29, d4
vmlsl.u8 q13, d30, d4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vext.8 d30, d12, d13, #5
vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q10, d29, d5
vmlal.u8 q12, d30, d5
vext.8 d28, d7, d8, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d13, d14, #5
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q11, d29, d5
vmlal.u8 q13, d30, d5
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d9, d10, #2
vext.8 d30, d12, d13, #2
vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q10, d29, d2
vmlal.u8 q12, d30, d2
vext.8 d28, d7, d8, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d13, d14, #2
vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q11, d29, d2
vmlal.u8 q13, d30, d2
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d9, d10, #3
vext.8 d30, d12, d13, #3
vext.8 d15, d7, d8, #3
vext.8 d31, d10, d11, #3
vext.8 d6, d13, d14, #3
vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
vqadd.s16 q10, q5
vqadd.s16 q12, q6
vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q7, d31, d3
vmull.u8 q3, d6, d3
subs r2, r2, #1
vqadd.s16 q9, q6
vqadd.s16 q11, q7
vqadd.s16 q13, q3
vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q9, #7
vqrshrun.s16 d8, q10, #7
vqrshrun.s16 d9, q11, #7
vqrshrun.s16 d10, q12, #7
vqrshrun.s16 d11, q13, #7
vst1.u8 {d6, d7, d8}, [lr]! ;store result
vst1.u8 {d9, d10, d11}, [lr]!
bne filt_blk2d_fp16x16_loop_neon
;Second pass: 16x16
;secondpass_filter - do first 8-columns and then second 8-columns
add r3, r12, r3, lsl #5
sub lr, lr, #336
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
mov r2, #16
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp16x16_outloop_neon
vld1.u8 {d18}, [lr], r2 ;load src data
vld1.u8 {d19}, [lr], r2
vld1.u8 {d20}, [lr], r2
vld1.u8 {d21}, [lr], r2
mov r12, #4 ;loop counter
vld1.u8 {d22}, [lr], r2
secondpass_inner_loop_neon
vld1.u8 {d23}, [lr], r2 ;load src data
vld1.u8 {d24}, [lr], r2
vld1.u8 {d25}, [lr], r2
vld1.u8 {d26}, [lr], r2
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_inner_loop_neon
subs r3, r3, #1
sub lr, lr, #336
add lr, lr, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_sp16x16_outloop_neon
add sp, sp, #336
pop {r4-r5,pc}
;--------------------
firstpass_filter16x16_only
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #8 ;loop counter
sub r0, r0, #2 ;move srcptr back to (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
pld [r0]
pld [r0, r1]
vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q7, d7, d0
vmull.u8 q8, d9, d0
vmull.u8 q9, d10, d0
vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d21, d9, d10, #1
vext.8 d22, d7, d8, #1
vext.8 d23, d10, d11, #1
vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
vext.8 d25, d9, d10, #4
vext.8 d26, d7, d8, #4
vext.8 d27, d10, d11, #4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d21, d1
vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d23, d1
vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d25, d4
vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d27, d4
vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vext.8 d20, d7, d8, #5
vext.8 d21, d10, d11, #5
vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
vext.8 d23, d9, d10, #2
vext.8 d24, d7, d8, #2
vext.8 d25, d10, d11, #2
vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
vext.8 d27, d9, d10, #3
vext.8 d28, d7, d8, #3
vext.8 d29, d10, d11, #3
vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d21, d5
vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d23, d2
vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q11, d27, d3
vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q15, d29, d3
vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q11
vqadd.s16 q7, q12
vqadd.s16 q9, q15
subs r2, r2, #1
vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q7, #7
vqrshrun.s16 d8, q8, #7
vqrshrun.s16 d9, q9, #7
vst1.u8 {q3}, [r4], r5 ;store result
vst1.u8 {q4}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;--------------------
secondpass_filter16x16_only
;Second pass: 16x16
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_spo16x16_outloop_neon
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.u8 {d19}, [r0], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r0], r1
mov r12, #4 ;loop counter
vld1.u8 {d22}, [r0], r1
secondpass_only_inner_loop_neon
vld1.u8 {d23}, [r0], r1 ;load src data
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_only_inner_loop_neon
subs r3, r3, #1
sub r0, r0, r1, lsl #4
sub r0, r0, r1, lsl #2
sub r0, r0, r1
add r0, r0, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_spo16x16_outloop_neon
pop {r4-r5,pc}
ENDP
;-----------------
END

View File

@ -1,422 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_sixtap_predict4x4_neon| PROC
push {r4, lr}
adr r12, filter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter4x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
;First pass: output_height lines x output_width columns (9x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vld1.u8 {q3}, [r0], r1 ;load the remaining 5 lines of src data
vld1.u8 {q4}, [r0], r1
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
;First pass on the remaining 5 lines of data
vld1.u8 {q11}, [r0], r1
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
add r3, r12, r3, lsl #5
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqadd.s16 q12, q11
vext.8 d23, d27, d28, #4
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d30, q8, #7
vqrshrun.s16 d31, q12, #7
;Second pass: 4x4
vabs.s32 q7, q5
vabs.s32 q8, q6
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
;---------------------
firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
;First pass: output_height lines x output_width columns (4x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
vst1.32 {d27[0]}, [r4] ;store result
vst1.32 {d27[1]}, [r0]
vst1.32 {d28[0]}, [r1]
vst1.32 {d28[1]}, [r2]
pop {r4, pc}
;---------------------
secondpass_filter4x4_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.32 {d27[0]}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.32 {d27[1]}, [r0], r1
vabs.s32 q7, q5
vld1.32 {d28[0]}, [r0], r1
vabs.s32 q8, q6
vld1.32 {d28[1]}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.32 {d29[0]}, [r0], r1
vdup.8 d1, d14[4]
vld1.32 {d29[1]}, [r0], r1
vdup.8 d2, d15[0]
vld1.32 {d30[0]}, [r0], r1
vdup.8 d3, d15[4]
vld1.32 {d30[1]}, [r0], r1
vdup.8 d4, d16[0]
vld1.32 {d31[0]}, [r0], r1
vdup.8 d5, d16[4]
vext.8 d23, d27, d28, #4
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
ENDP
;-----------------
END

View File

@ -1,473 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x4_only
sub sp, sp, #32 ;reserve space on stack for temporary storage
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
mov lr, sp
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (9x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vld1.u8 {q4}, [r0], r1
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q5}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q7}, [r0], r1
vst1.u8 {d25}, [lr]!
;first_pass filtering on the remaining 5 lines of data
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d27, q9, #7
vqrshrun.s16 d28, q10, #7
vqrshrun.s16 d29, q11, #7
vqrshrun.s16 d30, q12, #7
;Second pass: 8x4
;secondpass_filter
add r3, r12, r3, lsl #5
sub lr, lr, #32
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {q12}, [lr]!
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
add sp, sp, #32
pop {r4-r5,pc}
;--------------------
firstpass_filter8x4_only
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (4x8)
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
pop {r4-r5,pc}
;---------------------
secondpass_filter8x4_only
;Second pass: 8x4
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {d22}, [r0], r1
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d25}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d28}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d29}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d30}, [r0], r1
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
pop {r4-r5,pc}
ENDP
;-----------------
END

View File

@ -1,524 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x8_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x8_only
sub sp, sp, #64 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (13x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
filt_blk2d_fp8x8_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
subs r2, r2, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q4}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q5}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d25}, [lr]!
bne filt_blk2d_fp8x8_loop_neon
;first_pass filtering on the remaining 5 lines of data
;vld1.u8 {q3}, [r0], r1 ;load src data
;vld1.u8 {q4}, [r0], r1
;vld1.u8 {q5}, [r0], r1
;vld1.u8 {q6}, [r0], r1
vld1.u8 {q7}, [r0], r1
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
add r3, r12, r3, lsl #5
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
sub lr, lr, #64
vqrshrun.s16 d27, q9, #7
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
vqrshrun.s16 d28, q10, #7
vld1.u8 {q10}, [lr]!
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q11, #7
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vqrshrun.s16 d30, q12, #7
vld1.u8 {q12}, [lr]!
;Second pass: 8x8
mov r3, #2 ;loop counter
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_sp8x8_loop_neon
add sp, sp, #64
pop {r4-r5,pc}
;---------------------
firstpass_filter8x8_only
;add r2, r12, r2, lsl #5 ;calculate filter location
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (8x8)
filt_blk2d_fpo8x8_loop_neon
vld1.u8 {q3}, [r0], r1 ;load src data
vld1.u8 {q4}, [r0], r1
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
;
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
subs r2, r2, #1
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
bne filt_blk2d_fpo8x8_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_filter8x8_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {d19}, [r0], r1
vabs.s32 q7, q5
vld1.u8 {d20}, [r0], r1
vabs.s32 q8, q6
vld1.u8 {d21}, [r0], r1
mov r3, #2 ;loop counter
vld1.u8 {d22}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d23}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d24}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d25}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
;Second pass: 8x8
filt_blk2d_spo8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_spo8x8_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
END
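When both offsets are non-zero, the routine above runs the horizontal filter over 13 rows (8 of them staged through the 64-byte stack area, the last 5 kept in registers) and then runs the vertical filter over that intermediate data. Below is a minimal scalar sketch of that two-pass layout; all names are illustrative, and the tap helper from the earlier sketch is repeated compactly so the example stands alone.

static unsigned char sixtap_point(const unsigned char *p, int step,
                                  const int *f) {
    int k, sum = 0;
    for (k = 0; k < 6; k++)
        sum += p[(k - 2) * step] * f[k];   /* taps over src_ptr[-2..3] */
    sum = (sum + 64) >> 7;                 /* round/shift, as vqrshrun #7 */
    return (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
}

static void sixtap_predict8x8_scalar(const unsigned char *src, int src_stride,
                                     unsigned char *dst, int dst_stride,
                                     const int *hfilter, const int *vfilter) {
    unsigned char temp[(8 + 5) * 8];       /* 13 intermediate rows */
    int r, c;
    src -= 2 * src_stride;                 /* first pass starts two rows above */
    for (r = 0; r < 8 + 5; r++)            /* first pass: horizontal filter */
        for (c = 0; c < 8; c++)
            temp[r * 8 + c] = sixtap_point(src + r * src_stride + c, 1, hfilter);
    for (r = 0; r < 8; r++)                /* second pass: vertical filter */
        for (c = 0; c < 8; c++)
            dst[r * dst_stride + c] =
                sixtap_point(temp + (r + 2) * 8 + c, 8, vfilter);
}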

File diff suppressed because it is too large

View File

@ -1,276 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance16x16_neon|
EXPORT |vp8_variance16x8_neon|
EXPORT |vp8_variance8x16_neon|
EXPORT |vp8_variance8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #8
variance16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
;VPADAL adds adjacent pairs of elements of a vector, and accumulates
;the results into the elements of the destination vector. The explanation
;in the ARM guide is wrong.
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance16x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
;vmov.32 r0, d0[0] ;this instruction costs a lot
;vmov.32 r1, d1[0]
;mul r0, r0, r0
;str r1, [r12]
;sub r0, r1, r0, lsr #8
; while sum is signed, sum * sum is always positive and must be treated as
; unsigned to avoid propagating the sign bit.
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;================================
;unsigned int vp8_variance16x8_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp8_variance16x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #4
variance16x8_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance16x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #7
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;=================================
;unsigned int vp8_variance8x16_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp8_variance8x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #8
variance8x16_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d2, d6
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
bne variance8x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #7
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;==================================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #2
variance8x8_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance8x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #6
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
END
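The shift immediates in the epilogues above (#8 for 16x16, #7 for 16x8 and 8x16, #6 for 8x8) are log2 of the block size, so each routine returns sse - sum*sum/(w*h). The scalar sketch below, with hypothetical names, only makes that identity explicit.

#include <stdint.h>

static unsigned int variance_wxh(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
    int r, c, sum = 0;
    unsigned int sse_acc = 0;
    for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
            int diff = src[c] - ref[c];              /* vsubl.u8 */
            sum += diff;                             /* vpadal.s16 accumulation */
            sse_acc += (unsigned int)(diff * diff);  /* vmlal.s16 accumulation */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sse_acc;
    /* sum*sum is non-negative and treated as unsigned, as noted in the 16x16
       routine; the division equals the vshr.u32 #8/#7/#6 shifts because
       w*h is a power of two here */
    return sse_acc - (unsigned int)(((int64_t)sum * sum) / (w * h));
}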

View File

@ -0,0 +1,323 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif
unsigned int vp8_variance16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance16x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
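A hypothetical call site for the intrinsics version above; the buffer and stride names here are placeholders, and only the vp8_variance16x16_neon prototype comes from the file itself.

unsigned int block_variance_16x16(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride) {
    unsigned int sse;
    /* return value is sse minus (sum*sum >> 8) for the 16x16 block;
       sse is also written through the out-parameter */
    return vp8_variance16x16_neon(src, src_stride, ref, ref_stride, &sse);
}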

View File

@ -1,423 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;-----------------
EXPORT |vp8_sub_pixel_variance16x16_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
bilinear_taps_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|vp8_sub_pixel_variance16x16_neon_func| PROC
push {r4-r6, lr}
adr r12, bilinear_taps_coeff
ldr r4, [sp, #16] ;load *dst_ptr from stack
ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
ldr r6, [sp, #24] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
add r2, r12, r2, lsl #3 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {d31}, [r2] ;load first_pass filter
beq firstpass_bfilter16x16_only
sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1
mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1
bne vp8e_filt_blk2d_fp16x16_loop_neon
;First-pass filtering for the remaining 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1
vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1
vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1
vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q3, d14, d1
vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1
vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q4, d15, d1
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272
vld1.u32 {d31}, [r3] ;load second_pass filter
sub sp, sp, #256
mov r3, sp
vld1.u8 {d22, d23}, [lr]! ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter
vp8e_filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
subs r12, r12, #1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r3]! ;store result
vst1.u8 {d4, d5}, [r3]!
vst1.u8 {d6, d7}, [r3]!
vmov q11, q15
vst1.u8 {d8, d9}, [r3]!
bne vp8e_filt_blk2d_sp16x16_loop_neon
b sub_pixel_variance16x16_neon
;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
sub sp, sp, #528 ;reserve space on stack for temporary storage
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]
mov r3, sp
;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r3]! ;store result
vqrshrn.u16 d21, q14, #7
vst1.u8 {d16, d17}, [r3]!
vst1.u8 {d18, d19}, [r3]!
vst1.u8 {d20, d21}, [r3]!
bne vp8e_filt_blk2d_fpo16x16_loop_neon
b sub_pixel_variance16x16_neon
;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
sub sp, sp, #528 ;reserve space on stack for temporary storage
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data
mov r3, sp
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vp8e_filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r3]! ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r3]!
vmov q11, q15
vst1.u8 {d6, d7}, [r3]!
vst1.u8 {d8, d9}, [r3]!
bne vp8e_filt_blk2d_spo16x16_loop_neon
b sub_pixel_variance16x16_neon
;----------------------------
;variance16x16
sub_pixel_variance16x16_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
sub r3, r3, #256
mov r12, #8
sub_pixel_variance16x16_neon_loop
vld1.8 {q0}, [r3]! ;Load up source and reference
vld1.8 {q2}, [r4], r5
vld1.8 {q1}, [r3]!
vld1.8 {q3}, [r4], r5
vsubl.u8 q11, d0, d4 ;diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne sub_pixel_variance16x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r6] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
add sp, sp, #528
vmov.32 r0, d0[0] ;return
pop {r4-r6,pc}
ENDP
END

View File

@ -1,572 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance_halfpixvar16x16_h_neon|
EXPORT |vp8_variance_halfpixvar16x16_v_neon|
EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
EXPORT |vp8_sub_pixel_variance16x16s_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;================================================
;unsigned int vp8_variance_halfpixvar16x16_h_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
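For reference, the scalar computation this routine vectorizes looks roughly like the following minimal C sketch (illustrative only, not part of the libvpx source). Each source pixel is averaged with its right neighbour using the same round-to-nearest halving as vrhadd.u8, the sum and sum of squares of the differences against dst_ptr are accumulated, *sse is written out, and the block variance sse - sum*sum/256 is returned:

#include <stdint.h>

/* Hypothetical scalar reference for the horizontal half-pixel variance.
 * Like the NEON code, it reads one pixel beyond each 16-pixel row. */
static unsigned int halfpixvar16x16_h_ref(const unsigned char *src_ptr, int src_stride,
                                          const unsigned char *dst_ptr, int dst_stride,
                                          unsigned int *sse)
{
    int64_t sum = 0;
    uint32_t sse_acc = 0;
    int r, c;

    for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
            /* horizontal half-pixel sample: (src[0] + src[1] + 1) >> 1, as vrhadd.u8 does */
            int pred = (src_ptr[c] + src_ptr[c + 1] + 1) >> 1;
            int diff = pred - dst_ptr[c];
            sum += diff;
            sse_acc += (uint32_t)(diff * diff);
        }
        src_ptr += src_stride;
        dst_ptr += dst_stride;
    }

    *sse = sse_acc;
    /* variance = SSE - sum^2 / N, with N = 16 * 16 = 256 */
    return sse_acc - (unsigned int)((sum * sum) >> 8);
}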
|vp8_variance_halfpixvar16x16_h_neon| PROC
push {lr}
mov r12, #4 ;loop counter
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
;First Pass: output_height lines x output_width columns (16x16)
vp8_filt_fpo16x16s_4_0_loop_neon
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
vld1.8 {q11}, [r2], r3
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.8 {q12}, [r2], r3
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.8 {q13}, [r2], r3
vld1.u8 {d12, d13, d14, d15}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vext.8 q3, q2, q3, #1
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vld1.8 {q14}, [r2], r3
vrhadd.u8 q1, q2, q3
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vsubl.u8 q4, d0, d22 ;diff
vsubl.u8 q5, d1, d23
vsubl.u8 q6, d2, d24
vsubl.u8 q7, d3, d25
vsubl.u8 q0, d4, d26
vsubl.u8 q1, d5, d27
vsubl.u8 q2, d6, d28
vsubl.u8 q3, d7, d29
vpadal.s16 q8, q4 ;sum
vmlal.s16 q9, d8, d8 ;sse
vmlal.s16 q10, d9, d9
subs r12, r12, #1
vpadal.s16 q8, q5
vmlal.s16 q9, d10, d10
vmlal.s16 q10, d11, d11
vpadal.s16 q8, q6
vmlal.s16 q9, d12, d12
vmlal.s16 q10, d13, d13
vpadal.s16 q8, q7
vmlal.s16 q9, d14, d14
vmlal.s16 q10, d15, d15
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne vp8_filt_fpo16x16s_4_0_loop_neon
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;================================================
;unsigned int vp8_variance_halfpixvar16x16_v_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_v_neon| PROC
push {lr}
mov r12, #4 ;loop counter
vld1.u8 {q0}, [r0], r1 ;load src data
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
vp8_filt_spo16x16s_0_4_loop_neon
vld1.u8 {q2}, [r0], r1
vld1.8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vld1.8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vld1.8 {q5}, [r2], r3
vld1.u8 {q15}, [r0], r1
vrhadd.u8 q0, q0, q2
vld1.8 {q7}, [r2], r3
vrhadd.u8 q2, q2, q4
vrhadd.u8 q4, q4, q6
vrhadd.u8 q6, q6, q15
vsubl.u8 q11, d0, d2 ;diff
vsubl.u8 q12, d1, d3
vsubl.u8 q13, d4, d6
vsubl.u8 q14, d5, d7
vsubl.u8 q0, d8, d10
vsubl.u8 q1, d9, d11
vsubl.u8 q2, d12, d14
vsubl.u8 q3, d13, d15
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vmov q0, q15
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne vp8_filt_spo16x16s_0_4_loop_neon
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;================================================
;unsigned int vp8_variance_halfpixvar16x16_hv_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_hv_neon| PROC
push {lr}
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q13, #0 ;q8 - sum
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vmov.i8 q14, #0 ;q9, q10 - sse
vmov.i8 q15, #0
mov r12, #4 ;loop counter
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
;First Pass: output_height lines x output_width columns (17x16)
vp8_filt16x16s_4_4_loop_neon
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
vld1.u8 {d16, d17, d18, d19}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vext.8 q9, q8, q9, #1
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vld1.8 {q5}, [r2], r3
vrhadd.u8 q0, q0, q1
vld1.8 {q6}, [r2], r3
vrhadd.u8 q1, q1, q2
vld1.8 {q7}, [r2], r3
vrhadd.u8 q2, q2, q3
vld1.8 {q8}, [r2], r3
vrhadd.u8 q3, q3, q4
vsubl.u8 q9, d0, d10 ;diff
vsubl.u8 q10, d1, d11
vsubl.u8 q11, d2, d12
vsubl.u8 q12, d3, d13
vsubl.u8 q0, d4, d14 ;diff
vsubl.u8 q1, d5, d15
vsubl.u8 q5, d6, d16
vsubl.u8 q6, d7, d17
vpadal.s16 q13, q9 ;sum
vmlal.s16 q14, d18, d18 ;sse
vmlal.s16 q15, d19, d19
vpadal.s16 q13, q10 ;sum
vmlal.s16 q14, d20, d20 ;sse
vmlal.s16 q15, d21, d21
vpadal.s16 q13, q11 ;sum
vmlal.s16 q14, d22, d22 ;sse
vmlal.s16 q15, d23, d23
vpadal.s16 q13, q12 ;sum
vmlal.s16 q14, d24, d24 ;sse
vmlal.s16 q15, d25, d25
subs r12, r12, #1
vpadal.s16 q13, q0 ;sum
vmlal.s16 q14, d0, d0 ;sse
vmlal.s16 q15, d1, d1
vpadal.s16 q13, q1 ;sum
vmlal.s16 q14, d2, d2 ;sse
vmlal.s16 q15, d3, d3
vpadal.s16 q13, q5 ;sum
vmlal.s16 q14, d10, d10 ;sse
vmlal.s16 q15, d11, d11
vmov q0, q4
vpadal.s16 q13, q6 ;sum
vmlal.s16 q14, d12, d12 ;sse
vmlal.s16 q15, d13, d13
bne vp8_filt16x16s_4_4_loop_neon
vadd.u32 q15, q14, q15 ;accumulate sse
vpaddl.s32 q0, q13 ;accumulate sum
vpaddl.u32 q1, q15
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;==============================
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack unsigned char *dst_ptr,
; stack int dst_pixels_per_line,
; stack unsigned int *sse
;note: in vp8_find_best_half_pixel_step() (called when 8<Speed<15) and in the first call of
;vp8_find_best_sub_pixel_step() (called when speed<=8), xoffset/yoffset can only be 4 or 0, which
;means the filter is either bypassed or its coefficients are {64, 64}. This simplified routine
;only works in that situation.
;note: it can happen that both xoffset and yoffset are zero; that case can be handled in C code instead.
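With the bilinear taps {64, 64} and VP8_FILTER_WEIGHT = 128 (VP8_FILTER_SHIFT = 7, see filter.h), each pass therefore degenerates to a rounded average of two neighbouring pixels, which is exactly what vrhadd.u8 computes below. A short illustrative C sketch of that reduction (not part of the source):

/* Generic bilinear tap with rounding, as used by the full sub-pixel path. */
static unsigned char bilinear_tap(unsigned char a, unsigned char b, int f0, int f1)
{
    return (unsigned char)((a * f0 + b * f1 + 64) >> 7);  /* round, VP8_FILTER_SHIFT = 7 */
}

/* For offset 4 the taps are {64, 64}, so the expression reduces to
 * (64*a + 64*b + 64) >> 7 == (a + b + 1) >> 1, i.e. vrhadd.u8 a, b. */
static unsigned char half_pixel_avg(unsigned char a, unsigned char b)
{
    return (unsigned char)((a + b + 1) >> 1);
}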
|vp8_sub_pixel_variance16x16s_neon| PROC
push {r4, lr}
ldr r4, [sp, #8] ;load *dst_ptr from stack
ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
ldr lr, [sp, #16] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16s_only
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq firstpass_bfilter16x16s_only
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
sub sp, sp, #256 ;reserve space on stack for temporary storage
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
mov r3, sp
mov r2, #4 ;loop counter
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16s_loop_neon
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
vld1.u8 {d16, d17, d18, d19}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vext.8 q9, q8, q9, #1
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vrhadd.u8 q0, q0, q1
vrhadd.u8 q1, q1, q2
vrhadd.u8 q2, q2, q3
vrhadd.u8 q3, q3, q4
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vmov q0, q4
vst1.u8 {d4, d5, d6, d7}, [r3]!
bne vp8e_filt_blk2d_fp16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;--------------------
firstpass_bfilter16x16s_only
mov r2, #2 ;loop counter
sub sp, sp, #256 ;reserve space on stack for temporary storage
mov r3, sp
;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16s_loop_neon
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vld1.u8 {d16, d17, d18, d19}, [r0], r1
vext.8 q3, q2, q3, #1
vld1.u8 {d20, d21, d22, d23}, [r0], r1
vext.8 q5, q4, q5, #1
vld1.u8 {d24, d25, d26, d27}, [r0], r1
vext.8 q7, q6, q7, #1
vld1.u8 {d28, d29, d30, d31}, [r0], r1
vext.8 q9, q8, q9, #1
vext.8 q11, q10, q11, #1
vext.8 q13, q12, q13, #1
vext.8 q15, q14, q15, #1
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q1, q2, q3
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vrhadd.u8 q5, q10, q11
vrhadd.u8 q6, q12, q13
vrhadd.u8 q7, q14, q15
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vst1.u8 {d4, d5, d6, d7}, [r3]!
vst1.u8 {d8, d9, d10, d11}, [r3]!
vst1.u8 {d12, d13, d14, d15}, [r3]!
bne vp8e_filt_blk2d_fpo16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;---------------------
secondpass_bfilter16x16s_only
sub sp, sp, #256 ;reserve space on stack for temporary storage
mov r2, #2 ;loop counter
vld1.u8 {d0, d1}, [r0], r1 ;load src data
mov r3, sp
vp8e_filt_blk2d_spo16x16s_loop_neon
vld1.u8 {d2, d3}, [r0], r1
vld1.u8 {d4, d5}, [r0], r1
vld1.u8 {d6, d7}, [r0], r1
vld1.u8 {d8, d9}, [r0], r1
vrhadd.u8 q0, q0, q1
vld1.u8 {d10, d11}, [r0], r1
vrhadd.u8 q1, q1, q2
vld1.u8 {d12, d13}, [r0], r1
vrhadd.u8 q2, q2, q3
vld1.u8 {d14, d15}, [r0], r1
vrhadd.u8 q3, q3, q4
vld1.u8 {d16, d17}, [r0], r1
vrhadd.u8 q4, q4, q5
vrhadd.u8 q5, q5, q6
vrhadd.u8 q6, q6, q7
vrhadd.u8 q7, q7, q8
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vmov q0, q8
vst1.u8 {d4, d5, d6, d7}, [r3]!
vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
vst1.u8 {d12, d13, d14, d15}, [r3]!
bne vp8e_filt_blk2d_spo16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;----------------------------
;variance16x16
sub_pixel_variance16x16s_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
sub r3, r3, #256
mov r2, #4
sub_pixel_variance16x16s_neon_loop
vld1.8 {q0}, [r3]! ;Load up source and reference
vld1.8 {q1}, [r4], r12
vld1.8 {q2}, [r3]!
vld1.8 {q3}, [r4], r12
vld1.8 {q4}, [r3]!
vld1.8 {q5}, [r4], r12
vld1.8 {q6}, [r3]!
vld1.8 {q7}, [r4], r12
vsubl.u8 q11, d0, d2 ;diff
vsubl.u8 q12, d1, d3
vsubl.u8 q13, d4, d6
vsubl.u8 q14, d5, d7
vsubl.u8 q0, d8, d10
vsubl.u8 q1, d9, d11
vsubl.u8 q2, d12, d14
vsubl.u8 q3, d13, d15
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r2, r2, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne sub_pixel_variance16x16s_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
add sp, sp, #256
vmov.32 r0, d0[0] ;return
pop {r4, pc}
ENDP
END

View File

@ -1,222 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sub_pixel_variance8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
|vp8_sub_pixel_variance8x8_neon| PROC
push {r4-r5, lr}
adr r12, bilinear_taps_coeff
ldr r4, [sp, #12] ;load *dst_ptr from stack
ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
ldr lr, [sp, #20] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (9x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vld1.u8 {q1}, [r0], r1 ;load src data
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vld1.u8 {q2}, [r0], r1
vqrshrn.u16 d23, q7, #7
vld1.u8 {q3}, [r0], r1
vqrshrn.u16 d24, q8, #7
vld1.u8 {q4}, [r0], r1
vqrshrn.u16 d25, q9, #7
;first_pass filtering on the remaining 5 lines of data
vld1.u8 {q5}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d27, q7, #7
vqrshrn.u16 d28, q8, #7
vqrshrn.u16 d29, q9, #7
vqrshrn.u16 d30, q10, #7
;Second pass: 8x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
;skip_secondpass_filter
beq sub_pixel_variance8x8_neon
add r3, r12, r3, lsl #3
vld1.u32 {d31}, [r3] ;load second_pass filter
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
vmlal.u8 q5, d27, d1
vmlal.u8 q6, d28, d1
vmlal.u8 q7, d29, d1
vmlal.u8 q8, d30, d1
vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d23, q2, #7
vqrshrn.u16 d24, q3, #7
vqrshrn.u16 d25, q4, #7
vqrshrn.u16 d26, q5, #7
vqrshrn.u16 d27, q6, #7
vqrshrn.u16 d28, q7, #7
vqrshrn.u16 d29, q8, #7
b sub_pixel_variance8x8_neon
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vld1.u8 {d27}, [r0], r1
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
b secondpass_filter
;----------------------
;vp8_variance8x8_neon
sub_pixel_variance8x8_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #2
sub_pixel_variance8x8_neon_loop
vld1.8 {d0}, [r4], r5 ;load dst data
subs r12, r12, #1
vld1.8 {d1}, [r4], r5
vld1.8 {d2}, [r4], r5
vsubl.u8 q4, d22, d0 ;calculate diff
vld1.8 {d3}, [r4], r5
vsubl.u8 q5, d23, d1
vsubl.u8 q6, d24, d2
vpadal.s16 q8, q4 ;sum
vmlal.s16 q9, d8, d8 ;sse
vmlal.s16 q10, d9, d9
vsubl.u8 q7, d25, d3
vpadal.s16 q8, q5
vmlal.s16 q9, d10, d10
vmlal.s16 q10, d11, d11
vmov q11, q13
vpadal.s16 q8, q6
vmlal.s16 q9, d12, d12
vmlal.s16 q10, d13, d13
vmov q12, q14
vpadal.s16 q8, q7
vmlal.s16 q9, d14, d14
vmlal.s16 q10, d15, d15
bne sub_pixel_variance8x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #6
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {r4-r5, pc}
ENDP
;-----------------
bilinear_taps_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END
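The bilinear_taps_coeff table above holds the two filter taps for each of the eight sub-pixel offsets as consecutive 32-bit words, so each entry is 8 bytes and `add r2, r12, r2, lsl #3` selects the pair for a given xoffset. A hedged C sketch of the equivalent lookup and first-pass filtering for one 8-pixel row (illustrative only, names are not from the source):

/* Same pairs as the DCD table above, indexed by xoffset (0..7). */
static const int bilinear_taps[8][2] = {
    { 128,   0 }, { 112,  16 }, { 96,  32 }, { 80,  48 },
    {  64,  64 }, {  48,  80 }, { 32,  96 }, { 16, 112 }
};

/* First-pass horizontal bilinear filter for one 8-pixel row (sketch only). */
static void bilinear_first_pass_row(const unsigned char *src, unsigned char *out, int xoffset)
{
    const int f0 = bilinear_taps[xoffset][0];
    const int f1 = bilinear_taps[xoffset][1];
    int c;

    for (c = 0; c < 8; c++)
        out[c] = (unsigned char)((src[c] * f0 + src[c + 1] * f1 + 64) >> 7);
}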

File diff suppressed because it is too large

View File

@ -1,58 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
#if HAVE_NEON
extern void vp8_build_intra_predictors_mby_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
{
unsigned char *y_buffer = x->dst.y_buffer;
unsigned char *ypred_ptr = x->predictor;
int y_stride = x->dst.y_stride;
int mode = x->mode_info_context->mbmi.mode;
int Up = x->up_available;
int Left = x->left_available;
vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}
extern void vp8_build_intra_predictors_mby_s_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
{
unsigned char *y_buffer = x->dst.y_buffer;
unsigned char *ypred_ptr = x->predictor;
int y_stride = x->dst.y_stride;
int mode = x->mode_info_context->mbmi.mode;
int Up = x->up_available;
int Left = x->left_available;
vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}
#endif

View File

@ -9,8 +9,8 @@
*/
#ifndef __INC_BLOCKD_H
#define __INC_BLOCKD_H
#ifndef VP8_COMMON_BLOCKD_H_
#define VP8_COMMON_BLOCKD_H_
void vpx_log(const char *format, ...);
@ -20,6 +20,10 @@ void vpx_log(const char *format, ...);
#include "treecoder.h"
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@ -297,4 +301,8 @@ typedef struct macroblockd
extern void vp8_build_block_doffsets(MACROBLOCKD *x);
extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
#endif /* __INC_BLOCKD_H */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_BLOCKD_H_

View File

@ -8,6 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
#define VP8_COMMON_COEFUPDATEPROBS_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Update probabilities for the nodes in the token entropy tree.
Generated file included by entropy.c */
@ -183,3 +189,9 @@ const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTE
},
},
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_COEFUPDATEPROBS_H_

View File

@ -9,8 +9,8 @@
*/
#ifndef common_h
#define common_h 1
#ifndef VP8_COMMON_COMMON_H_
#define VP8_COMMON_COMMON_H_
#include <assert.h>
@ -18,6 +18,13 @@
#include "vpx_mem/vpx_mem.h"
#ifdef __cplusplus
extern "C" {
#endif
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
/* Only need this for fixed-size arrays, for structs just assign. */
#define vp8_copy( Dest, Src) { \
@ -37,4 +44,8 @@
#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
#endif /* common_h */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_COMMON_H_

View File

@ -8,6 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
#ifdef __cplusplus
extern "C" {
#endif
/*Generated file, included by entropy.c*/
@ -186,3 +192,9 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES]
}
}
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_

View File

@ -9,12 +9,16 @@
*/
#ifndef __INC_ENTROPY_H
#define __INC_ENTROPY_H
#ifndef VP8_COMMON_ENTROPY_H_
#define VP8_COMMON_ENTROPY_H_
#include "treecoder.h"
#include "blockd.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Coefficient token alphabet */
#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
@ -98,4 +102,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
void vp8_coef_tree_initialize(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ENTROPY_H_

View File

@ -9,12 +9,16 @@
*/
#ifndef __INC_ENTROPYMODE_H
#define __INC_ENTROPYMODE_H
#ifndef VP8_COMMON_ENTROPYMODE_H_
#define VP8_COMMON_ENTROPYMODE_H_
#include "onyxc_int.h"
#include "treecoder.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef enum
{
SUBMVREF_NORMAL,
@ -77,4 +81,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x);
void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ENTROPYMODE_H_

View File

@ -9,11 +9,15 @@
*/
#ifndef __INC_ENTROPYMV_H
#define __INC_ENTROPYMV_H
#ifndef VP8_COMMON_ENTROPYMV_H_
#define VP8_COMMON_ENTROPYMV_H_
#include "treecoder.h"
#ifdef __cplusplus
extern "C" {
#endif
enum
{
mv_max = 1023, /* max absolute value of a MV component */
@ -41,4 +45,8 @@ typedef struct mv_context
extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ENTROPYMV_H_

View File

@ -9,11 +9,15 @@
*/
#ifndef __INC_EXTEND_H
#define __INC_EXTEND_H
#ifndef VP8_COMMON_EXTEND_H_
#define VP8_COMMON_EXTEND_H_
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
YV12_BUFFER_CONFIG *dst);
@ -22,4 +26,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
int srcy, int srcx,
int srch, int srcw);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_EXTEND_H_

View File

@ -9,11 +9,15 @@
*/
#ifndef FILTER_H
#define FILTER_H
#ifndef VP8_COMMON_FILTER_H_
#define VP8_COMMON_FILTER_H_
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
#define BLOCK_HEIGHT_WIDTH 4
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
@ -21,4 +25,8 @@
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_FILTER_H_

View File

@ -9,14 +9,18 @@
*/
#ifndef __INC_FINDNEARMV_H
#define __INC_FINDNEARMV_H
#ifndef VP8_COMMON_FINDNEARMV_H_
#define VP8_COMMON_FINDNEARMV_H_
#include "mv.h"
#include "blockd.h"
#include "modecont.h"
#include "treecoder.h"
#ifdef __cplusplus
extern "C" {
#endif
static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
const int *ref_frame_sign_bias)
@ -179,4 +183,8 @@ static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi
return (cur_mb->bmi + b - 4)->as_mode;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_FINDNEARMV_H_

View File

@ -9,8 +9,12 @@
*/
#ifndef __INC_HEADER_H
#define __INC_HEADER_H
#ifndef VP8_COMMON_HEADER_H_
#define VP8_COMMON_HEADER_H_
#ifdef __cplusplus
extern "C" {
#endif
/* 24 bits total */
typedef struct
@ -40,4 +44,8 @@ typedef struct
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_HEADER_H_

View File

@ -9,8 +9,8 @@
*/
#ifndef __INC_INVTRANS_H
#define __INC_INVTRANS_H
#ifndef VP8_COMMON_INVTRANS_H_
#define VP8_COMMON_INVTRANS_H_
#include "vpx_config.h"
#include "vp8_rtcd.h"
@ -21,6 +21,10 @@
#include "vpx_mem/vpx_mem.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
static void eob_adjust(char *eobs, short *diff)
{
/* eob adjust.... the idct can only skip if both the dc and eob are zero */
@ -59,4 +63,8 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_INVTRANS_H_

View File

@ -15,6 +15,7 @@
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
static void lf_init_lut(loop_filter_info_n *lfi)
{
int filt_lvl;

View File

@ -9,13 +9,17 @@
*/
#ifndef loopfilter_h
#define loopfilter_h
#ifndef VP8_COMMON_LOOPFILTER_H_
#define VP8_COMMON_LOOPFILTER_H_
#include "vpx_ports/mem.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
#ifdef __cplusplus
extern "C" {
#endif
#define MAX_LOOP_FILTER 63
/* fraction of total macroblock rows to be used in fast filter level picking */
/* has to be > 2 */
@ -102,4 +106,8 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm,
int mb_row, int post_ystride, int post_uvstride,
unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_LOOPFILTER_H_

View File

@ -9,9 +9,17 @@
*/
#ifndef __INC_MODECONT_H
#define __INC_MODECONT_H
#ifndef VP8_COMMON_MODECONT_H_
#define VP8_COMMON_MODECONT_H_
#ifdef __cplusplus
extern "C" {
#endif
extern const int vp8_mode_contexts[6][4];
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_MODECONT_H_

View File

@ -9,10 +9,14 @@
*/
#ifndef __INC_MV_H
#define __INC_MV_H
#ifndef VP8_COMMON_MV_H_
#define VP8_COMMON_MV_H_
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct
{
short row;
@ -25,4 +29,8 @@ typedef union int_mv
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_MV_H_

View File

@ -9,8 +9,8 @@
*/
#ifndef __INC_VP8_H
#define __INC_VP8_H
#ifndef VP8_COMMON_ONYX_H_
#define VP8_COMMON_ONYX_H_
#ifdef __cplusplus
extern "C"
@ -39,8 +39,8 @@ extern "C"
typedef enum
{
USAGE_STREAM_FROM_SERVER = 0x0,
USAGE_LOCAL_FILE_PLAYBACK = 0x1,
USAGE_LOCAL_FILE_PLAYBACK = 0x0,
USAGE_STREAM_FROM_SERVER = 0x1,
USAGE_CONSTRAINED_QUALITY = 0x2,
USAGE_CONSTANT_QUALITY = 0x3
} END_USAGE;
@ -104,7 +104,18 @@ extern "C"
struct vpx_rational timebase;
unsigned int target_bandwidth; /* kilobits per second */
/* parameter used for applying pre processing blur: recommendation 0 */
/* Parameter used for applying denoiser.
* For temporal denoiser: noise_sensitivity = 0 means off,
* noise_sensitivity = 1 means temporal denoiser on for Y channel only,
* noise_sensitivity = 2 means temporal denoiser on for all channels.
* noise_sensitivity = 3 means aggressive denoising mode.
* noise_sensitivity >= 4 means adaptive denoising mode.
* Temporal denoiser is enabled via the configuration option:
* CONFIG_TEMPORAL_DENOISING.
* For spatial denoiser: noise_sensitivity controls the amount of
* pre-processing blur: noise_sensitivity = 0 means off.
* Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING.
*/
int noise_sensitivity;
/* parameter used for sharpening output: recommendation 0: */
@ -213,7 +224,7 @@ extern "C"
int arnr_strength;
int arnr_type;
struct vpx_fixed_buf two_pass_stats_in;
vpx_fixed_buf_t two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
@ -267,4 +278,4 @@ extern "C"
}
#endif
#endif
#endif // VP8_COMMON_ONYX_H_
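Regarding the expanded noise_sensitivity documentation above: applications normally set this field through the public encoder control rather than writing to VP8_CONFIG directly. A hedged usage sketch (assumes an encoder context already initialised with vpx_codec_enc_init(); error handling omitted):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* 0 = off, 1 = temporal denoiser on the Y channel only (per the comment above). */
static void enable_denoiser(vpx_codec_ctx_t *encoder)
{
    vpx_codec_control(encoder, VP8E_SET_NOISE_SENSITIVITY, 1);
}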

View File

@ -9,8 +9,8 @@
*/
#ifndef __INC_VP8C_INT_H
#define __INC_VP8C_INT_H
#ifndef VP8_COMMON_ONYXC_INT_H_
#define VP8_COMMON_ONYXC_INT_H_
#include "vpx_config.h"
#include "vp8_rtcd.h"
@ -26,6 +26,10 @@
#include "header.h"
/*#endif*/
#ifdef __cplusplus
extern "C" {
#endif
#define MINQ 0
#define MAXQ 127
#define QINDEX_RANGE (MAXQ + 1)
@ -174,4 +178,8 @@ typedef struct VP8Common
int cpu_caps;
} VP8_COMMON;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_ONYXC_INT_H_

View File

@ -9,8 +9,8 @@
*/
#ifndef __INC_VP8D_H
#define __INC_VP8D_H
#ifndef VP8_COMMON_ONYXD_H_
#define VP8_COMMON_ONYXD_H_
/* Create/destroy static data structures. */
@ -60,4 +60,4 @@ extern "C"
#endif
#endif
#endif // VP8_COMMON_ONYXD_H_

View File

@ -71,11 +71,6 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
};
#endif
static const short kernel5[] =
{
1, 1, 4, 1, 1
};
const short vp8_rv[] =
{
8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
@ -308,13 +303,14 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
{
d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
}
s[-8*pitch] = d[(r-8)&15];
if (r >= 8)
s[-8*pitch] = d[(r-8)&15];
s += pitch;
}
}
}
#if CONFIG_POSTPROC
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
@ -387,6 +383,7 @@ void vp8_deblock(VP8_COMMON *cm,
vp8_yv12_copy_frame(source, post);
}
}
#endif
#if !(CONFIG_TEMPORAL_DENOISING)
void vp8_de_noise(VP8_COMMON *cm,
@ -396,12 +393,12 @@ void vp8_de_noise(VP8_COMMON *cm,
int low_var_thresh,
int flag)
{
int mbr;
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
int mb_rows = source->y_width >> 4;
int mb_cols = source->y_height >> 4;
int mb_rows = cm->mb_rows;
int mb_cols = cm->mb_cols;
unsigned char *limits = cm->pp_limits_buffer;
int mbr, mbc;
(void) post;
(void) low_var_thresh;
(void) flag;

View File

@ -9,8 +9,8 @@
*/
#ifndef POSTPROC_H
#define POSTPROC_H
#ifndef VP8_COMMON_POSTPROC_H_
#define VP8_COMMON_POSTPROC_H_
#include "vpx_ports/mem.h"
struct postproc_state
@ -26,6 +26,10 @@ struct postproc_state
};
#include "onyxc_int.h"
#include "ppflags.h"
#ifdef __cplusplus
extern "C" {
#endif
int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
vp8_ppflags_t *flags);
@ -47,4 +51,8 @@ void vp8_deblock(struct VP8Common *oci,
#define MFQE_PRECISION 4
void vp8_multiframe_quality_enhance(struct VP8Common *cm);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_POSTPROC_H_

View File

@ -9,8 +9,12 @@
*/
#ifndef __INC_PPFLAGS_H
#define __INC_PPFLAGS_H
#ifndef VP8_COMMON_PPFLAGS_H_
#define VP8_COMMON_PPFLAGS_H_
#ifdef __cplusplus
extern "C" {
#endif
enum
{
VP8D_NOFILTERING = 0,
@ -38,4 +42,8 @@ typedef struct
int display_mv_flag;
} vp8_ppflags_t;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_PPFLAGS_H_

View File

@ -1,19 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifdef __INTEL_COMPILER
#pragma warning(disable:997 1011 170)
#endif
#ifdef _MSC_VER
#pragma warning(disable:4799)
#endif

View File

@ -8,14 +8,27 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_QUANT_COMMON_H_
#define VP8_COMMON_QUANT_COMMON_H_
#include "string.h"
#include "blockd.h"
#include "onyxc_int.h"
#ifdef __cplusplus
extern "C" {
#endif
extern int vp8_ac_yquant(int QIndex);
extern int vp8_dc_quant(int QIndex, int Delta);
extern int vp8_dc2quant(int QIndex, int Delta);
extern int vp8_ac2quant(int QIndex, int Delta);
extern int vp8_dc_uv_quant(int QIndex, int Delta);
extern int vp8_ac_uv_quant(int QIndex, int Delta);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_QUANT_COMMON_H_

View File

@ -9,8 +9,12 @@
*/
#ifndef __INC_RECONINTER_H
#define __INC_RECONINTER_H
#ifndef VP8_COMMON_RECONINTER_H_
#define VP8_COMMON_RECONINTER_H_
#ifdef __cplusplus
extern "C" {
#endif
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
@ -32,4 +36,8 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_RECONINTER_H_

View File

@ -9,10 +9,14 @@
*/
#ifndef __INC_RECONINTRA4x4_H
#define __INC_RECONINTRA4x4_H
#ifndef VP8_COMMON_RECONINTRA4X4_H_
#define VP8_COMMON_RECONINTRA4X4_H_
#include "vp8/common/blockd.h"
#ifdef __cplusplus
extern "C" {
#endif
static void intra_prediction_down_copy(MACROBLOCKD *xd,
unsigned char *above_right_src)
{
@ -29,4 +33,8 @@ static void intra_prediction_down_copy(MACROBLOCKD *xd,
*dst_ptr2 = *src_ptr;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_RECONINTRA4X4_H_

View File

@ -8,10 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef SETUPINTRARECON_H
#define SETUPINTRARECON_H
#ifndef VP8_COMMON_SETUPINTRARECON_H_
#define VP8_COMMON_SETUPINTRARECON_H_
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
@ -34,4 +38,8 @@ void setup_intra_recon_left(unsigned char *y_buffer,
v_buffer[uv_stride *i] = (unsigned char) 129;
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_SETUPINTRARECON_H_

View File

@ -9,11 +9,19 @@
*/
#ifndef SWAPYV12_BUFFER_H
#define SWAPYV12_BUFFER_H
#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
#define VP8_COMMON_SWAPYV12BUFFER_H_
#include "vpx_scale/yv12config.h"
#ifdef __cplusplus
extern "C" {
#endif
void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_SWAPYV12BUFFER_H_

View File

@ -8,8 +8,20 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
#define VP8_COMMON_SYSTEMDEPENDENT_H_
#include "vpx_config.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP8Common;
void vp8_machine_specific_config(struct VP8Common *);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_SYSTEMDEPENDENT_H_

View File

@ -9,8 +9,12 @@
*/
#ifndef _PTHREAD_EMULATION
#define _PTHREAD_EMULATION
#ifndef VP8_COMMON_THREADING_H_
#define VP8_COMMON_THREADING_H_
#ifdef __cplusplus
extern "C" {
#endif
#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
@ -183,4 +187,8 @@ static inline int sem_destroy(sem_t * sem)
#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_THREADING_H_

View File

@ -9,8 +9,12 @@
*/
#ifndef __INC_TREECODER_H
#define __INC_TREECODER_H
#ifndef VP8_COMMON_TREECODER_H_
#define VP8_COMMON_TREECODER_H_
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned char vp8bc_index_t; /* probability index */
@ -87,4 +91,8 @@ void vp8bc_tree_probs_from_distribution(
);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_TREECODER_H_

View File

@ -9,11 +9,15 @@
*/
#ifndef VARIANCE_H
#define VARIANCE_H
#ifndef VP8_COMMON_VARIANCE_H_
#define VP8_COMMON_VARIANCE_H_
#include "vpx_config.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef unsigned int(*vp8_sad_fn_t)(
const unsigned char *src_ptr,
int source_stride,
@ -112,4 +116,8 @@ typedef struct variance_vtable
#endif
} vp8_variance_fn_ptr_t;
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_VARIANCE_H_

View File

@ -8,6 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
#ifdef __cplusplus
extern "C" {
#endif
/*Generated file, included by entropymode.c*/
@ -240,3 +246,9 @@ const vp8_prob vp8_kf_bmode_prob
{ 112, 19, 12, 61, 195, 128, 48, 4, 24 }
}
};
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_

View File

@ -8,11 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef FILTER_X86_H
#define FILTER_X86_H
#ifndef VP8_COMMON_X86_FILTER_X86_H_
#define VP8_COMMON_X86_FILTER_X86_H_
#include "vpx_ports/mem.h"
#ifdef __cplusplus
extern "C" {
#endif
/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
* duplicated values */
@ -22,4 +26,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
/* duplicated 8x */
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
#endif /* FILTER_X86_H */
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_X86_FILTER_X86_H_

Some files were not shown because too many files have changed in this diff