Bug 1063356 - Update libvpx source. r=kinetik

Results of running ./update.py --ndk ~/android/android-ndk-r9 --commit c731d6a4f19eea861ceb2ff31399420b2452eb74

parent 1d50c84dd8
commit 307e5968d6
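
For reference, a hedged sketch of reproducing this kind of update from a local checkout (Python, run from media/libvpx; the NDK path below is an illustrative stand-in for ~/android/android-ndk-r9):

    # Hypothetical reproduction of the quoted command; update.py and the
    # NDK directory must exist locally for this to run.
    import subprocess

    subprocess.check_call([
        "./update.py",
        "--ndk", "/home/user/android/android-ndk-r9",
        "--commit", "c731d6a4f19eea861ceb2ff31399420b2452eb74",
    ])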
@@ -1,22 +1,23 @@
Additional IP Rights Grant (Patents)
------------------------------------

"This implementation" means the copyrightable works distributed by
Google as part of the WebM Project.
"These implementations" means the copyrightable works that implement the WebM
codecs distributed by Google as part of the WebM Project.

Google hereby grants to you a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer, and otherwise run, modify and propagate the contents of this
implementation of VP8, where such license applies only to those patent
claims, both currently owned by Google and acquired in the future,
licensable by Google that are necessarily infringed by this
implementation of VP8. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of VP8 or any code incorporated within this
implementation of VP8 constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of VP8
shall terminate as of the date such litigation is filed.
Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and otherwise
run, modify and propagate the contents of these implementations of WebM, where
such license applies only to those patent claims, both currently owned by
Google and acquired in the future, licensable by Google that are necessarily
infringed by these implementations of WebM. This grant does not include claims
that would be infringed only as a consequence of further modification of these
implementations. If you or your agent or exclusive licensee institute or order
or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitutes direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.
@@ -8,4 +8,4 @@ The libvpx git repository is:

  https://gerrit.chromium.org/gerrit/webm/libvpx

The git commit ID used was 2e88f2f2ec777259bda1714e72f1ecd2519bceb5
The git commit ID used was c731d6a4f19eea861ceb2ff31399420b2452eb74
@@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
##
##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
##
@@ -21,6 +21,7 @@ typedef enum {
OUTPUT_FMT_PLAIN,
OUTPUT_FMT_RVDS,
OUTPUT_FMT_GAS,
OUTPUT_FMT_C_HEADER,
} output_fmt_t;

int log_msg(const char *fmt, ...) {

@@ -33,6 +34,18 @@ int log_msg(const char *fmt, ...) {
}

#if defined(__GNUC__) && __GNUC__

#if defined(FORCE_PARSE_ELF)

#if defined(__MACH__)
#undef __MACH__
#endif

#if !defined(__ELF__)
#define __ELF__
#endif
#endif

#if defined(__MACH__)

#include <mach-o/loader.h>

@@ -43,9 +56,12 @@ int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
case OUTPUT_FMT_RVDS:
printf("%-40s EQU %5d\n", name, val);
return 0;
case OUTPUT_FMT_GAS:
case OUTPUT_FMT_GAS:
printf(".set %-40s, %5d\n", name, val);
return 0;
case OUTPUT_FMT_C_HEADER:
printf("#define %-40s %5d\n", name, val);
return 0;
default:
log_msg("Unsupported mode: %d", mode);
return 1;

@@ -321,7 +337,7 @@ bail:
return 1;
}

char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
if (elf->bits == 32) {
Elf32_Shdr shdr;

@@ -491,6 +507,13 @@ int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
sym.st_name),
val);
break;
case OUTPUT_FMT_C_HEADER:
printf("#define %-40s %5d\n",
parse_elf_string_table(&elf,
shdr.sh_link,
sym.st_name),
val);
break;
default:
printf("%s = %d\n",
parse_elf_string_table(&elf,

@@ -655,7 +678,11 @@ int parse_coff(uint8_t *buf, size_t sz) {
}
strcpy(sectionlist[i], sectionname);

if (!strcmp(sectionname, ".rdata")) sectionrawdata_ptr = get_le32(ptr + 20);
// check if it's .rdata and is not a COMDAT section.
if (!strcmp(sectionname, ".rdata") &&
(get_le32(ptr + 36) & 0x1000) == 0) {
sectionrawdata_ptr = get_le32(ptr + 20);
}

ptr += 40;
}

@@ -762,6 +789,7 @@ int main(int argc, char **argv) {
fprintf(stderr, "Output Formats:\n");
fprintf(stderr, " gas - compatible with GNU assembler\n");
fprintf(stderr, " rvds - compatible with armasm\n");
fprintf(stderr, " cheader - c/c++ header file\n");
goto bail;
}

@@ -771,6 +799,8 @@ int main(int argc, char **argv) {
mode = OUTPUT_FMT_RVDS;
else if (!strcmp(argv[1], "gas"))
mode = OUTPUT_FMT_GAS;
else if (!strcmp(argv[1], "cheader"))
mode = OUTPUT_FMT_C_HEADER;
else
f = argv[1];
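
As an aside (not part of the patch): the new "cheader" mode added above emits one C #define per symbol using the format string "#define %-40s %5d". A small Python illustration with an invented symbol name and value:

    # Mimics obj_int_extract's cheader format string; the name and value
    # are made up for illustration.
    name, val = "DETOK_QCOEFF_BASE", 36
    print("#define %-40s %5d" % (name, val))
    # The name is left-justified in 40 columns, the value right-aligned in 5.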
@@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
##
##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
##

@@ -24,7 +24,7 @@ sub FixThumbInstructions($$)
# with left shift, addition and a right shift (to restore the
# register to the original value). Currently the right shift
# isn't necessary in the code base since the values in these
# registers aren't used, but doing the shift for consitency.
# registers aren't used, but doing the shift for consistency.
# This converts instructions such as "add r12, r12, r5, lsl r4"
# into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4".
s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g;

@@ -51,7 +51,7 @@ sub FixThumbInstructions($$)

# Convert register post indexing to a separate add instruction.
# This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
# "add r0, r2".
# "addne r0, r0, r2".
s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;

# Convert a conditional addition to the pc register into a series of
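
As an aside (not part of the patch): the first substitution in FixThumbInstructions can be sketched outside Perl. A rough Python equivalent of that s/// rule, applied to the example from the comment above:

    # Rewrites "add rD, rN, rM, lsl rS" into the lsl/add/lsr sequence the
    # comment describes; a translation of the Perl regex for illustration.
    import re

    line = "    add r12, r12, r5, lsl r4"
    fixed = re.sub(
        r"^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)",
        r"\1lsl\3\6, \7\n\1\2\3\4, \5, \6\n\1lsr\3\6, \7",
        line,
    )
    print(fixed)  # lsl r5, r4 / add r12, r12, r5 / lsr r5, r4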
@@ -1,244 +1,105 @@
files = {
'ARM_ASM': [
'vp8/common/arm/armv6/bilinearfilter_v6.asm',
'vp8/common/arm/armv6/copymem16x16_v6.asm',
'vp8/common/arm/armv6/copymem8x4_v6.asm',
'vp8/common/arm/armv6/copymem8x8_v6.asm',
'vp8/common/arm/armv6/dc_only_idct_add_v6.asm',
'vp8/common/arm/armv6/dequant_idct_v6.asm',
'vp8/common/arm/armv6/dequantize_v6.asm',
'vp8/common/arm/armv6/filter_v6.asm',
'vp8/common/arm/armv6/idct_blk_v6.c',
'vp8/common/arm/armv6/idct_v6.asm',
'vp8/common/arm/armv6/intra4x4_predict_v6.asm',
'vp8/common/arm/armv6/iwalsh_v6.asm',
'vp8/common/arm/armv6/loopfilter_v6.asm',
'vp8/common/arm/armv6/simpleloopfilter_v6.asm',
'vp8/common/arm/armv6/sixtappredict8x4_v6.asm',
'vp8/common/arm/armv6/vp8_sad16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance8x8_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm',
'vp8/common/arm/bilinearfilter_arm.c',
'vp8/common/arm/dequantize_arm.c',
'vp8/common/arm/filter_arm.c',
'vp8/common/arm/loopfilter_arm.c',
'vp8/common/arm/neon/bilinearpredict16x16_neon.asm',
'vp8/common/arm/neon/bilinearpredict4x4_neon.asm',
'vp8/common/arm/neon/bilinearpredict8x4_neon.asm',
'vp8/common/arm/neon/bilinearpredict8x8_neon.asm',
'vp8/common/arm/neon/buildintrapredictorsmby_neon.asm',
'vp8/common/arm/neon/copymem16x16_neon.asm',
'vp8/common/arm/neon/copymem8x4_neon.asm',
'vp8/common/arm/neon/copymem8x8_neon.asm',
'vp8/common/arm/neon/dc_only_idct_add_neon.asm',
'vp8/common/arm/neon/dequant_idct_neon.asm',
'vp8/common/arm/neon/dequantizeb_neon.asm',
'vp8/common/arm/neon/idct_blk_neon.c',
'vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
'vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
'vp8/common/arm/neon/iwalsh_neon.asm',
'vp8/common/arm/neon/loopfilter_neon.asm',
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm',
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
'vp8/common/arm/neon/mbloopfilter_neon.asm',
'vp8/common/arm/neon/sad16_neon.asm',
'vp8/common/arm/neon/sad8_neon.asm',
'vp8/common/arm/neon/save_reg_neon.asm',
'vp8/common/arm/neon/shortidct4x4llm_neon.asm',
'vp8/common/arm/neon/sixtappredict16x16_neon.asm',
'vp8/common/arm/neon/sixtappredict4x4_neon.asm',
'vp8/common/arm/neon/sixtappredict8x4_neon.asm',
'vp8/common/arm/neon/sixtappredict8x8_neon.asm',
'vp8/common/arm/neon/variance_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
'vp8/common/arm/reconintra_arm.c',
'vp8/common/arm/variance_arm.c',
'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm',
'vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm',
'vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm',
'vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm',
'vp8/encoder/arm/armv6/vp8_subtract_armv6.asm',
'vp8/encoder/arm/armv6/walsh_v6.asm',
'vp8/encoder/arm/boolhuff_arm.c',
'vp8/encoder/arm/dct_arm.c',
'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
'vp8/encoder/arm/neon/picklpf_arm.c',
'vp8/encoder/arm/neon/shortfdct_neon.asm',
'vp8/encoder/arm/neon/subtract_neon.asm',
'vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm',
'vp8/encoder/arm/quantize_arm.c',
'vp9/common/arm/neon/vp9_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_neon.asm',
'vp9/common/arm/neon/vp9_convolve_neon.c',
'vp9/common/arm/neon/vp9_copy_neon.asm',
'vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_neon.c',
'vp9/common/arm/neon/vp9_loopfilter_neon.asm',
'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
'vp9/common/arm/neon/vp9_save_reg_neon.asm',
'vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm',
'vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm',
'vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm',
'vpx_ports/arm_cpudetect.c',
'vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm',
'vpx_scale/arm/neon/yv12extend_arm.c'
],
'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c'],
'UNIFIED_SOURCES': [
'vp8/common/alloccommon.c',
'vp8/common/blockd.c',
'vp8/common/debugmodes.c',
'vp8/common/dequantize.c',
'vp8/common/entropy.c',
'vp8/common/entropymode.c',
'vp8/common/entropymv.c',
'vp8/common/extend.c',
'vp8/common/filter.c',
'vp8/common/findnearmv.c',
'vp8/common/generic/systemdependent.c',
'vp8/common/idct_blk.c',
'vp8/common/idctllm.c',
'vp8/common/loopfilter.c',
'vp8/common/loopfilter_filters.c',
'vp8/common/mbpitch.c',
'vp8/common/modecont.c',
'vp8/common/quant_common.c',
'vp8/common/reconinter.c',
'vp8/common/reconintra.c',
'vp8/common/reconintra4x4.c',
'vp8/common/setupintrarecon.c',
'vp8/common/swapyv12buffer.c',
'vp8/common/treecoder.c',
'vp8/common/variance_c.c',
'vp8/decoder/dboolhuff.c',
'vp8/decoder/decodemv.c',
'vp8/decoder/decodframe.c',
'vp8/decoder/detokenize.c',
'vp8/decoder/onyxd_if.c',
'vp8/decoder/threading.c',
'vp8/encoder/bitstream.c',
'vp8/encoder/dct.c',
'vp8/encoder/denoising.c',
'vp8/encoder/encodeframe.c',
'vp8/encoder/encodeintra.c',
'vp8/encoder/encodemb.c',
'vp8/encoder/encodemv.c',
'vp8/encoder/ethreading.c',
'vp8/encoder/firstpass.c',
'vp8/encoder/lookahead.c',
'vp8/encoder/mcomp.c',
'vp8/encoder/modecosts.c',
'vp8/encoder/mr_dissim.c',
'vp8/encoder/onyx_if.c',
'vp8/encoder/pickinter.c',
'vp8/encoder/picklpf.c',
'vp8/encoder/psnr.c',
'vp8/encoder/quantize.c',
'vp8/encoder/ratectrl.c',
'vp8/encoder/rdopt.c',
'vp8/encoder/segmentation.c',
'vp8/encoder/temporal_filter.c',
'vp8/encoder/tokenize.c',
'vp8/encoder/treewriter.c',
'vp8/vp8_cx_iface.c',
'vp9/common/generic/vp9_systemdependent.c',
'vp9/common/vp9_alloccommon.c',
'vp9/common/vp9_common_data.c',
'vp9/common/vp9_convolve.c',
'vp9/common/vp9_debugmodes.c',
'vp9/common/vp9_entropy.c',
'vp9/common/vp9_entropymode.c',
'vp9/common/vp9_extend.c',
'vp9/common/vp9_filter.c',
'vp9/common/vp9_findnearmv.c',
'vp9/common/vp9_idct.c',
'vp9/common/vp9_loopfilter.c',
'vp9/common/vp9_loopfilter_filters.c',
'vp9/common/vp9_mvref_common.c',
'vp9/common/vp9_pred_common.c',
'vp9/common/vp9_quant_common.c',
'vp9/common/vp9_reconinter.c',
'vp9/common/vp9_reconintra.c',
'vp9/common/vp9_scale.c',
'vp9/common/vp9_scan.c',
'vp9/common/vp9_seg_common.c',
'vp9/common/vp9_tile_common.c',
'vp9/common/vp9_treecoder.c',
'vp9/decoder/vp9_dboolhuff.c',
'vp9/decoder/vp9_decodemv.c',
'vp9/decoder/vp9_decodframe.c',
'vp9/decoder/vp9_detokenize.c',
'vp9/decoder/vp9_dsubexp.c',
'vp9/decoder/vp9_onyxd_if.c',
'vp9/decoder/vp9_thread.c',
'vp9/encoder/vp9_boolhuff.c',
'vp9/encoder/vp9_dct.c',
'vp9/encoder/vp9_encodeframe.c',
'vp9/encoder/vp9_encodeintra.c',
'vp9/encoder/vp9_encodemb.c',
'vp9/encoder/vp9_encodemv.c',
'vp9/encoder/vp9_firstpass.c',
'vp9/encoder/vp9_lookahead.c',
'vp9/encoder/vp9_mbgraph.c',
'vp9/encoder/vp9_mcomp.c',
'vp9/encoder/vp9_modecosts.c',
'vp9/encoder/vp9_onyx_if.c',
'vp9/encoder/vp9_picklpf.c',
'vp9/encoder/vp9_psnr.c',
'vp9/encoder/vp9_quantize.c',
'vp9/encoder/vp9_ratectrl.c',
'vp9/encoder/vp9_rdopt.c',
'vp9/encoder/vp9_sad_c.c',
'vp9/encoder/vp9_segmentation.c',
'vp9/encoder/vp9_subexp.c',
'vp9/encoder/vp9_temporal_filter.c',
'vp9/encoder/vp9_tokenize.c',
'vp9/encoder/vp9_treewriter.c',
'vp9/encoder/vp9_vaq.c',
'vp9/encoder/vp9_variance_c.c',
'vp9/vp9_cx_iface.c',
'vp9/vp9_dx_iface.c',
'vpx/src/vpx_codec.c',
'vpx/src/vpx_decoder.c',
'vpx/src/vpx_encoder.c',
'vpx/src/vpx_image.c',
'vpx_scale/generic/gen_scalers.c',
'vpx_scale/generic/vpx_scale.c',
'vpx_scale/generic/yv12config.c',
'vpx_scale/generic/yv12extend.c',
'vpx_scale/vpx_scale_rtcd.c'
],
'SOURCES': [
'vp8/common/rtcd.c',
'vp8/common/sad_c.c',
'vp8/vp8_dx_iface.c',
'vp9/common/vp9_entropymv.c',
'vp9/common/vp9_rtcd.c',
'vp9/encoder/vp9_bitstream.c',
'vpx/src/svc_encodeframe.c',
'vpx_mem/vpx_mem.c',
],
'ARM_ASM': ['vp8/common/arm/armv6/bilinearfilter_v6.asm',
'vp8/common/arm/armv6/copymem16x16_v6.asm',
'vp8/common/arm/armv6/copymem8x4_v6.asm',
'vp8/common/arm/armv6/copymem8x8_v6.asm',
'vp8/common/arm/armv6/dc_only_idct_add_v6.asm',
'vp8/common/arm/armv6/dequant_idct_v6.asm',
'vp8/common/arm/armv6/dequantize_v6.asm',
'vp8/common/arm/armv6/filter_v6.asm',
'vp8/common/arm/armv6/idct_blk_v6.c',
'vp8/common/arm/armv6/idct_v6.asm',
'vp8/common/arm/armv6/intra4x4_predict_v6.asm',
'vp8/common/arm/armv6/iwalsh_v6.asm',
'vp8/common/arm/armv6/loopfilter_v6.asm',
'vp8/common/arm/armv6/simpleloopfilter_v6.asm',
'vp8/common/arm/armv6/sixtappredict8x4_v6.asm',
'vp8/common/arm/armv6/vp8_sad16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance16x16_armv6.asm',
'vp8/common/arm/armv6/vp8_variance8x8_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm',
'vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm',
'vp8/common/arm/bilinearfilter_arm.c',
'vp8/common/arm/dequantize_arm.c',
'vp8/common/arm/filter_arm.c',
'vp8/common/arm/loopfilter_arm.c',
'vp8/common/arm/neon/bilinearpredict_neon.c',
'vp8/common/arm/neon/copymem_neon.c',
'vp8/common/arm/neon/dc_only_idct_add_neon.c',
'vp8/common/arm/neon/dequant_idct_neon.c',
'vp8/common/arm/neon/dequantizeb_neon.c',
'vp8/common/arm/neon/idct_blk_neon.c',
'vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
'vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
'vp8/common/arm/neon/iwalsh_neon.c',
'vp8/common/arm/neon/loopfilter_neon.c',
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
'vp8/common/arm/neon/mbloopfilter_neon.c',
'vp8/common/arm/neon/reconintra_neon.c',
'vp8/common/arm/neon/sad_neon.c',
'vp8/common/arm/neon/shortidct4x4llm_neon.c',
'vp8/common/arm/neon/sixtappredict_neon.c',
'vp8/common/arm/neon/variance_neon.c',
'vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
'vp8/common/arm/variance_arm.c',
'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm',
'vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm',
'vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm',
'vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm',
'vp8/encoder/arm/armv6/vp8_subtract_armv6.asm',
'vp8/encoder/arm/armv6/walsh_v6.asm',
'vp8/encoder/arm/boolhuff_arm.c',
'vp8/encoder/arm/dct_arm.c',
'vp8/encoder/arm/neon/denoising_neon.c',
'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
'vp8/encoder/arm/neon/shortfdct_neon.c',
'vp8/encoder/arm/neon/subtract_neon.c',
'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
'vp8/encoder/arm/quantize_arm.c',
'vp9/common/arm/neon/vp9_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_neon.asm',
'vp9/common/arm/neon/vp9_convolve_neon.c',
'vp9/common/arm/neon/vp9_copy_neon.asm',
'vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_add_neon.asm',
'vp9/common/arm/neon/vp9_idct16x16_neon.c',
'vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct32x32_add_neon.asm',
'vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm',
'vp9/common/arm/neon/vp9_idct8x8_add_neon.asm',
'vp9/common/arm/neon/vp9_iht4x4_add_neon.asm',
'vp9/common/arm/neon/vp9_iht8x8_add_neon.asm',
'vp9/common/arm/neon/vp9_loopfilter_16_neon.asm',
'vp9/common/arm/neon/vp9_loopfilter_16_neon.c',
'vp9/common/arm/neon/vp9_loopfilter_neon.asm',
'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
'vp9/common/arm/neon/vp9_reconintra_neon.asm',
'vp9/common/arm/neon/vp9_save_reg_neon.asm',
'vp9/encoder/arm/neon/vp9_dct_neon.c',
'vp9/encoder/arm/neon/vp9_quantize_neon.c',
'vp9/encoder/arm/neon/vp9_sad_neon.c',
'vp9/encoder/arm/neon/vp9_subtract_neon.c',
'vp9/encoder/arm/neon/vp9_variance_neon.c',
'vpx_ports/arm_cpudetect.c'],
'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c',
'vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c',
'vp9/encoder/x86/vp9_dct32x32_avx2.c',
'vp9/encoder/x86/vp9_dct_avx2.c',
'vp9/encoder/x86/vp9_error_intrin_avx2.c',
'vp9/encoder/x86/vp9_sad4d_intrin_avx2.c',
'vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c',
'vp9/encoder/x86/vp9_variance_avx2.c',
'vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c'],
'ERROR_CONCEALMENT': ['vp8/decoder/error_concealment.c'],
'EXPORTS': ['vpx/vp8.h',
'vpx/vp8cx.h',
@@ -246,6 +107,7 @@ files = {
'vpx/vpx_codec.h',
'vpx/vpx_decoder.h',
'vpx/vpx_encoder.h',
'vpx/vpx_frame_buffer.h',
'vpx/vpx_image.h',
'vpx/vpx_integer.h',
'vpx_mem/include/vpx_mem_intrnl.h',

@@ -256,86 +118,225 @@ files = {
'vpx_ports/x86.h',
'vpx_scale/vpx_scale.h',
'vpx_scale/yv12config.h'],
'SOURCES': ['vp8/common/rtcd.c',
'vp8/common/sad_c.c',
'vp8/encoder/bitstream.c',
'vp8/encoder/onyx_if.c',
'vp8/vp8_dx_iface.c',
'vp9/common/vp9_alloccommon.c',
'vp9/common/vp9_blockd.c',
'vp9/common/vp9_common_data.c',
'vp9/common/vp9_convolve.c',
'vp9/common/vp9_debugmodes.c',
'vp9/common/vp9_entropy.c',
'vp9/common/vp9_entropymode.c',
'vp9/common/vp9_entropymv.c',
'vp9/common/vp9_filter.c',
'vp9/common/vp9_frame_buffers.c',
'vp9/common/vp9_idct.c',
'vp9/common/vp9_loopfilter.c',
'vp9/common/vp9_loopfilter_filters.c',
'vp9/common/vp9_mvref_common.c',
'vp9/common/vp9_pred_common.c',
'vp9/common/vp9_prob.c',
'vp9/common/vp9_quant_common.c',
'vp9/common/vp9_reconinter.c',
'vp9/common/vp9_reconintra.c',
'vp9/common/vp9_rtcd.c',
'vp9/common/vp9_scale.c',
'vp9/common/vp9_scan.c',
'vp9/common/vp9_seg_common.c',
'vp9/common/vp9_thread.c',
'vp9/common/vp9_tile_common.c',
'vp9/decoder/vp9_decodeframe.c',
'vp9/decoder/vp9_decodemv.c',
'vp9/decoder/vp9_decoder.c',
'vp9/decoder/vp9_detokenize.c',
'vp9/decoder/vp9_dsubexp.c',
'vp9/decoder/vp9_dthread.c',
'vp9/decoder/vp9_reader.c',
'vp9/encoder/vp9_aq_complexity.c',
'vp9/encoder/vp9_aq_cyclicrefresh.c',
'vp9/encoder/vp9_aq_variance.c',
'vp9/encoder/vp9_bitstream.c',
'vp9/encoder/vp9_context_tree.c',
'vp9/encoder/vp9_cost.c',
'vp9/encoder/vp9_dct.c',
'vp9/encoder/vp9_encodeframe.c',
'vp9/encoder/vp9_encodemb.c',
'vp9/encoder/vp9_encodemv.c',
'vp9/encoder/vp9_encoder.c',
'vp9/encoder/vp9_extend.c',
'vp9/encoder/vp9_firstpass.c',
'vp9/encoder/vp9_lookahead.c',
'vp9/encoder/vp9_mbgraph.c',
'vp9/encoder/vp9_mcomp.c',
'vp9/encoder/vp9_picklpf.c',
'vp9/encoder/vp9_pickmode.c',
'vp9/encoder/vp9_quantize.c',
'vp9/encoder/vp9_ratectrl.c',
'vp9/encoder/vp9_rd.c',
'vp9/encoder/vp9_rdopt.c',
'vp9/encoder/vp9_resize.c',
'vp9/encoder/vp9_sad.c',
'vp9/encoder/vp9_segmentation.c',
'vp9/encoder/vp9_speed_features.c',
'vp9/encoder/vp9_subexp.c',
'vp9/encoder/vp9_svc_layercontext.c',
'vp9/encoder/vp9_temporal_filter.c',
'vp9/encoder/vp9_tokenize.c',
'vp9/encoder/vp9_treewriter.c',
'vp9/encoder/vp9_variance.c',
'vp9/encoder/vp9_write_bit_buffer.c',
'vp9/encoder/vp9_writer.c',
'vp9/vp9_cx_iface.c',
'vp9/vp9_dx_iface.c',
'vpx/src/svc_encodeframe.c',
'vpx/src/vpx_encoder.c',
'vpx_mem/vpx_mem.c',
'vpx_scale/generic/yv12config.c',
'vpx_scale/generic/yv12extend.c',
'vpx_scale/vpx_scale_rtcd.c'],
'UNIFIED_SOURCES': ['vp8/common/alloccommon.c',
'vp8/common/blockd.c',
'vp8/common/debugmodes.c',
'vp8/common/dequantize.c',
'vp8/common/entropy.c',
'vp8/common/entropymode.c',
'vp8/common/entropymv.c',
'vp8/common/extend.c',
'vp8/common/filter.c',
'vp8/common/findnearmv.c',
'vp8/common/generic/systemdependent.c',
'vp8/common/idct_blk.c',
'vp8/common/idctllm.c',
'vp8/common/loopfilter.c',
'vp8/common/loopfilter_filters.c',
'vp8/common/mbpitch.c',
'vp8/common/modecont.c',
'vp8/common/quant_common.c',
'vp8/common/reconinter.c',
'vp8/common/reconintra.c',
'vp8/common/reconintra4x4.c',
'vp8/common/setupintrarecon.c',
'vp8/common/swapyv12buffer.c',
'vp8/common/treecoder.c',
'vp8/common/variance_c.c',
'vp8/decoder/dboolhuff.c',
'vp8/decoder/decodeframe.c',
'vp8/decoder/decodemv.c',
'vp8/decoder/detokenize.c',
'vp8/decoder/onyxd_if.c',
'vp8/decoder/threading.c',
'vp8/encoder/dct.c',
'vp8/encoder/denoising.c',
'vp8/encoder/encodeframe.c',
'vp8/encoder/encodeintra.c',
'vp8/encoder/encodemb.c',
'vp8/encoder/encodemv.c',
'vp8/encoder/ethreading.c',
'vp8/encoder/firstpass.c',
'vp8/encoder/lookahead.c',
'vp8/encoder/mcomp.c',
'vp8/encoder/modecosts.c',
'vp8/encoder/mr_dissim.c',
'vp8/encoder/pickinter.c',
'vp8/encoder/picklpf.c',
'vp8/encoder/quantize.c',
'vp8/encoder/ratectrl.c',
'vp8/encoder/rdopt.c',
'vp8/encoder/segmentation.c',
'vp8/encoder/temporal_filter.c',
'vp8/encoder/tokenize.c',
'vp8/encoder/treewriter.c',
'vp8/vp8_cx_iface.c',
'vp9/decoder/vp9_read_bit_buffer.c',
'vpx/src/vpx_codec.c',
'vpx/src/vpx_decoder.c',
'vpx/src/vpx_image.c',
'vpx/src/vpx_psnr.c',
'vpx_scale/generic/gen_scalers.c',
'vpx_scale/generic/vpx_scale.c'],
'VP8_POSTPROC': ['vp8/common/mfqe.c', 'vp8/common/postproc.c'],
'VP9_POSTPROC': ['vp9/common/vp9_postproc.c'],
'X86-64_ASM': ['third_party/x86inc/x86inc.asm',
'vp8/common/x86/loopfilter_block_sse2.asm',
'vp9/encoder/x86/vp9_quantize_ssse3.asm'],
'X86_ASM': [
'vp8/common/x86/dequantize_mmx.asm',
'vp8/common/x86/filter_x86.c',
'vp8/common/x86/idct_blk_mmx.c',
'vp8/common/x86/idct_blk_sse2.c',
'vp8/common/x86/idctllm_mmx.asm',
'vp8/common/x86/idctllm_sse2.asm',
'vp8/common/x86/iwalsh_mmx.asm',
'vp8/common/x86/iwalsh_sse2.asm',
'vp8/common/x86/loopfilter_mmx.asm',
'vp8/common/x86/loopfilter_sse2.asm',
'vp8/common/x86/loopfilter_x86.c',
'vp8/common/x86/mfqe_sse2.asm',
'vp8/common/x86/postproc_mmx.asm',
'vp8/common/x86/postproc_sse2.asm',
'vp8/common/x86/postproc_x86.c',
'vp8/common/x86/recon_mmx.asm',
'vp8/common/x86/recon_sse2.asm',
'vp8/common/x86/recon_wrapper_sse2.c',
'vp8/common/x86/sad_mmx.asm',
'vp8/common/x86/sad_sse2.asm',
'vp8/common/x86/sad_sse3.asm',
'vp8/common/x86/sad_sse4.asm',
'vp8/common/x86/sad_ssse3.asm',
'vp8/common/x86/subpixel_mmx.asm',
'vp8/common/x86/subpixel_sse2.asm',
'vp8/common/x86/subpixel_ssse3.asm',
'vp8/common/x86/variance_impl_mmx.asm',
'vp8/common/x86/variance_impl_sse2.asm',
'vp8/common/x86/variance_impl_ssse3.asm',
'vp8/common/x86/variance_mmx.c',
'vp8/common/x86/variance_sse2.c',
'vp8/common/x86/variance_ssse3.c',
'vp8/common/x86/vp8_asm_stubs.c',
'vp8/encoder/x86/dct_mmx.asm',
'vp8/encoder/x86/dct_sse2.asm',
'vp8/encoder/x86/denoising_sse2.c',
'vp8/encoder/x86/encodeopt.asm',
'vp8/encoder/x86/fwalsh_sse2.asm',
'vp8/encoder/x86/quantize_mmx.asm',
'vp8/encoder/x86/quantize_sse2.c',
'vp8/encoder/x86/quantize_sse4.asm',
'vp8/encoder/x86/quantize_ssse3.asm',
'vp8/encoder/x86/subtract_mmx.asm',
'vp8/encoder/x86/subtract_sse2.asm',
'vp8/encoder/x86/temporal_filter_apply_sse2.asm',
'vp8/encoder/x86/vp8_enc_stubs_mmx.c',
'vp8/encoder/x86/vp8_enc_stubs_sse2.c',
'vp9/common/x86/vp9_asm_stubs.c',
'vp9/common/x86/vp9_copy_sse2.asm',
'vp9/common/x86/vp9_idct_intrin_sse2.c',
'vp9/common/x86/vp9_intrapred_sse2.asm',
'vp9/common/x86/vp9_intrapred_ssse3.asm',
'vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
'vp9/common/x86/vp9_loopfilter_mmx.asm',
'vp9/common/x86/vp9_subpixel_8t_sse2.asm',
'vp9/common/x86/vp9_subpixel_8t_ssse3.asm',
'vp9/encoder/x86/vp9_dct32x32_sse2.c',
'vp9/encoder/x86/vp9_dct_sse2.c',
'vp9/encoder/x86/vp9_error_sse2.asm',
'vp9/encoder/x86/vp9_sad4d_sse2.asm',
'vp9/encoder/x86/vp9_sad_mmx.asm',
'vp9/encoder/x86/vp9_sad_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse3.asm',
'vp9/encoder/x86/vp9_sad_sse4.asm',
'vp9/encoder/x86/vp9_sad_ssse3.asm',
'vp9/encoder/x86/vp9_subpel_variance.asm',
'vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm',
'vp9/encoder/x86/vp9_subtract_sse2.asm',
'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
'vp9/encoder/x86/vp9_variance_impl_mmx.asm',
'vp9/encoder/x86/vp9_variance_impl_sse2.asm',
'vp9/encoder/x86/vp9_variance_mmx.c',
'vp9/encoder/x86/vp9_variance_sse2.c',
'vpx_ports/emms.asm',
'vpx_ports/x86_cpuid.c',
]
'vp8/common/x86/loopfilter_block_sse2_x86_64.asm',
'vp8/encoder/x86/ssim_opt_x86_64.asm',
'vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm',
'vp9/encoder/x86/vp9_ssim_opt_x86_64.asm'],
'X86_ASM': ['vp8/common/x86/dequantize_mmx.asm',
'vp8/common/x86/filter_x86.c',
'vp8/common/x86/idct_blk_mmx.c',
'vp8/common/x86/idct_blk_sse2.c',
'vp8/common/x86/idctllm_mmx.asm',
'vp8/common/x86/idctllm_sse2.asm',
'vp8/common/x86/iwalsh_mmx.asm',
'vp8/common/x86/iwalsh_sse2.asm',
'vp8/common/x86/loopfilter_mmx.asm',
'vp8/common/x86/loopfilter_sse2.asm',
'vp8/common/x86/loopfilter_x86.c',
'vp8/common/x86/mfqe_sse2.asm',
'vp8/common/x86/postproc_mmx.asm',
'vp8/common/x86/postproc_sse2.asm',
'vp8/common/x86/recon_mmx.asm',
'vp8/common/x86/recon_sse2.asm',
'vp8/common/x86/recon_wrapper_sse2.c',
'vp8/common/x86/sad_mmx.asm',
'vp8/common/x86/sad_sse2.asm',
'vp8/common/x86/sad_sse3.asm',
'vp8/common/x86/sad_sse4.asm',
'vp8/common/x86/sad_ssse3.asm',
'vp8/common/x86/subpixel_mmx.asm',
'vp8/common/x86/subpixel_sse2.asm',
'vp8/common/x86/subpixel_ssse3.asm',
'vp8/common/x86/variance_impl_mmx.asm',
'vp8/common/x86/variance_impl_sse2.asm',
'vp8/common/x86/variance_impl_ssse3.asm',
'vp8/common/x86/variance_mmx.c',
'vp8/common/x86/variance_sse2.c',
'vp8/common/x86/variance_ssse3.c',
'vp8/common/x86/vp8_asm_stubs.c',
'vp8/encoder/x86/dct_mmx.asm',
'vp8/encoder/x86/dct_sse2.asm',
'vp8/encoder/x86/denoising_sse2.c',
'vp8/encoder/x86/encodeopt.asm',
'vp8/encoder/x86/fwalsh_sse2.asm',
'vp8/encoder/x86/quantize_mmx.asm',
'vp8/encoder/x86/quantize_sse2.c',
'vp8/encoder/x86/quantize_sse4.c',
'vp8/encoder/x86/quantize_ssse3.c',
'vp8/encoder/x86/subtract_mmx.asm',
'vp8/encoder/x86/subtract_sse2.asm',
'vp8/encoder/x86/temporal_filter_apply_sse2.asm',
'vp8/encoder/x86/vp8_enc_stubs_mmx.c',
'vp8/encoder/x86/vp8_enc_stubs_sse2.c',
'vp9/common/x86/vp9_asm_stubs.c',
'vp9/common/x86/vp9_copy_sse2.asm',
'vp9/common/x86/vp9_idct_intrin_sse2.c',
'vp9/common/x86/vp9_idct_intrin_ssse3.c',
'vp9/common/x86/vp9_idct_ssse3_x86_64.asm',
'vp9/common/x86/vp9_intrapred_sse2.asm',
'vp9/common/x86/vp9_intrapred_ssse3.asm',
'vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
'vp9/common/x86/vp9_loopfilter_mmx.asm',
'vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c',
'vp9/common/x86/vp9_subpixel_8t_sse2.asm',
'vp9/common/x86/vp9_subpixel_8t_ssse3.asm',
'vp9/common/x86/vp9_subpixel_bilinear_sse2.asm',
'vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm',
'vp9/encoder/x86/vp9_dct32x32_sse2.c',
'vp9/encoder/x86/vp9_dct_mmx.asm',
'vp9/encoder/x86/vp9_dct_sse2.c',
'vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm',
'vp9/encoder/x86/vp9_error_sse2.asm',
'vp9/encoder/x86/vp9_sad4d_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse3.asm',
'vp9/encoder/x86/vp9_sad_sse4.asm',
'vp9/encoder/x86/vp9_sad_ssse3.asm',
'vp9/encoder/x86/vp9_subpel_variance.asm',
'vp9/encoder/x86/vp9_subtract_sse2.asm',
'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
'vp9/encoder/x86/vp9_variance_sse2.c',
'vpx_ports/emms.asm']
}
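
For context, a hedged sketch of how a moz.build file might consume a dict like the one above; include() is a real moz.build primitive, but the consumer and the condition below are illustrative, not taken from this patch:

    # Illustrative moz.build fragment (not from this commit).
    include('sources.mozbuild')  # assumed to define the `files` dict

    UNIFIED_SOURCES += files['UNIFIED_SOURCES']
    SOURCES += files['SOURCES']

    if CONFIG['CPU_ARCH'] == 'arm':  # hypothetical condition
        SOURCES += files['ARM_ASM']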
media/libvpx/third_party/x86inc/x86inc.asm
@@ -234,10 +234,10 @@ ALIGNMODE k7
%define r%1mp %2
%elif ARCH_X86_64 ; memory
%define r%1m [rsp + stack_offset + %6]
%define r%1mp qword r %+ %1m
%define r%1mp qword r %+ %1 %+ m
%else
%define r%1m [esp + stack_offset + %6]
%define r%1mp dword r %+ %1m
%define r%1mp dword r %+ %1 %+ m
%endif
%define r%1 %2
%endmacro

@@ -395,6 +395,23 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%assign n_arg_names %0
%endmacro

%if ARCH_X86_64
%macro ALLOC_STACK 2 ; stack_size, num_regs
%assign %%stack_aligment ((mmsize + 15) & ~15)
%assign stack_size_padded %1

%assign %%reg_num (%2 - 1)
%xdefine rsp_tmp r %+ %%reg_num
mov rsp_tmp, rsp
sub rsp, stack_size_padded
and rsp, ~(%%stack_aligment - 1)
%endmacro

%macro RESTORE_STACK 0 ; reset rsp register
mov rsp, rsp_tmp
%endmacro
%endif

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx, cl

@@ -592,16 +609,20 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
CAT_XDEFINE cglobaled_, %1, 1
%endif
%xdefine current_function %1
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf32
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
global %1:private_extern
%elifidn __OUTPUT_FORMAT__,macho64
global %1:private_extern
%ifdef CHROMIUM
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf32
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
global %1:private_extern
%elifidn __OUTPUT_FORMAT__,macho64
global %1:private_extern
%else
global %1
%endif
%else
global %1
%endif
@@ -9,15 +9,23 @@
 */

#ifndef __INC_ALLOCCOMMON_H
#define __INC_ALLOCCOMMON_H
#ifndef VP8_COMMON_ALLOCCOMMON_H_
#define VP8_COMMON_ALLOCCOMMON_H_

#include "onyxc_int.h"

#ifdef __cplusplus
extern "C" {
#endif

void vp8_create_common(VP8_COMMON *oci);
void vp8_remove_common(VP8_COMMON *oci);
void vp8_de_alloc_frame_buffers(VP8_COMMON *oci);
int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height);
void vp8_setup_version(VP8_COMMON *oci);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_ALLOCCOMMON_H_
@@ -53,7 +53,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -77,7 +77,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -101,7 +101,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -127,7 +127,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -51,7 +51,7 @@ loop
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; substract negative differences from sum
sub r4, r4, r7 ; subtract negative differences from sum

; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords

@@ -77,7 +77,7 @@ loop

; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; substract negative differences from sum
sub r4, r4, r7 ; subtract negative differences from sum

; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords

@@ -58,7 +58,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -89,7 +89,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -120,7 +120,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -153,7 +153,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -69,7 +69,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -111,7 +111,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -153,7 +153,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -195,7 +195,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -59,7 +59,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; substract negative differences from sum
subs r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -90,7 +90,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -121,7 +121,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

@@ -154,7 +154,7 @@ loop

; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; substract negative differences from sum
sub r8, r8, r5 ; subtract negative differences from sum

; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@@ -9,8 +9,12 @@
 */

#ifndef BILINEARFILTER_ARM_H
#define BILINEARFILTER_ARM_H
#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_

#ifdef __cplusplus
extern "C" {
#endif

extern void vp8_filter_block2d_bil_first_pass_armv6
(

@@ -32,4 +36,8 @@ extern void vp8_filter_block2d_bil_second_pass_armv6
const short *vp8_filter
);

#endif /* BILINEARFILTER_ARM_H */
#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
@@ -12,26 +12,9 @@
#include "vpx_config.h"
#include "vp8/common/blockd.h"

#if HAVE_NEON
extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
#endif

#if HAVE_MEDIA
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
#endif

#if HAVE_NEON

void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;

vp8_dequantize_b_loop_neon(Q, DQC, DQ);
}
#endif

#if HAVE_MEDIA
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
@@ -34,11 +34,11 @@ typedef void loopfilter_uv_neon(unsigned char *u, int pitch,

extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;

extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;

extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif
@@ -1,357 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

EXPORT |vp8_bilinear_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8

AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch

|vp8_bilinear_predict16x16_neon| PROC
push {r4-r5, lr}

adr r12, bifilter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack

cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only

add r2, r12, r2, lsl #3 ;calculate filter location

cmp r3, #0 ;skip second_pass filter if yoffset=0

vld1.s32 {d31}, [r2] ;load first_pass filter

beq firstpass_bfilter16x16_only

sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1

mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1

vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1

vdup.8 d1, d31[4]

;First Pass: output_height lines x output_width columns (17x16)
filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0

vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1

vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1

vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1

vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1

subs r2, r2, #1

vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7

vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1

vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1

bne filt_blk2d_fp16x16_loop_neon

;First-pass filtering for rest 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1

vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0

vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1

vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1

vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1

vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1

vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0

vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1

vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q3, d14, d1

vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1

vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q4, d15, d1

vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7

vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!

;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272

vld1.u32 {d31}, [r3] ;load second_pass filter

vld1.u8 {d22, d23}, [lr]! ;load src data

vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter

filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!

vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0

vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1

subs r12, r12, #1

vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7

vst1.u8 {d2, d3}, [r4], r5 ;store result
vst1.u8 {d4, d5}, [r4], r5
vst1.u8 {d6, d7}, [r4], r5
vmov q11, q15
vst1.u8 {d8, d9}, [r4], r5

bne filt_blk2d_sp16x16_loop_neon

add sp, sp, #272

pop {r4-r5,pc}

;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]

;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1

pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0

vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1

vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1

vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1

vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1

subs r2, r2, #1

vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r4], r5 ;store result
vqrshrn.u16 d21, q14, #7

vst1.u8 {d16, d17}, [r4], r5
vst1.u8 {d18, d19}, [r4], r5
vst1.u8 {d20, d21}, [r4], r5

bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}

;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data

vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]

filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1

vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0

vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1

vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7

vst1.u8 {d2, d3}, [r4], r5 ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r4], r5
vmov q11, q15
vst1.u8 {d6, d7}, [r4], r5
vst1.u8 {d8, d9}, [r4], r5

bne filt_blk2d_spo16x16_loop_neon
pop {r4-r5,pc}

ENDP

;-----------------

bifilter16_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

END
@ -1,130 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_bilinear_predict4x4_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch

|vp8_bilinear_predict4x4_neon| PROC
    push {r4, lr}

    adr r12, bifilter4_coeff
    ldr r4, [sp, #8] ;load parameters from stack
    ldr lr, [sp, #12] ;load parameters from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq skip_firstpass_filter

;First pass: output_height lines x output_width columns (5x4)
    vld1.u8 {d2}, [r0], r1 ;load src data
    add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)

    vld1.u8 {d3}, [r0], r1
    vld1.u32 {d31}, [r2] ;first_pass filter

    vld1.u8 {d4}, [r0], r1
    vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
    vld1.u8 {d5}, [r0], r1
    vdup.8 d1, d31[4]
    vld1.u8 {d6}, [r0], r1

    vshr.u64 q4, q1, #8 ;construct src_ptr[1]
    vshr.u64 q5, q2, #8
    vshr.u64 d12, d6, #8

    vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
    vzip.32 d4, d5
    vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
    vzip.32 d10, d11

    vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0])
    vmull.u8 q8, d4, d0
    vmull.u8 q9, d6, d0

    vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8 q8, d10, d1
    vmlal.u8 q9, d12, d1

    vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d29, q8, #7
    vqrshrn.u16 d30, q9, #7

;Second pass: 4x4
secondpass_filter
    cmp r3, #0 ;skip second_pass filter if yoffset=0
    beq skip_secondpass_filter

    add r3, r12, r3, lsl #3 ;calculate Vfilter location
    vld1.u32 {d31}, [r3] ;load second_pass filter

    vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
    vdup.8 d1, d31[4]

    vmull.u8 q1, d28, d0
    vmull.u8 q2, d29, d0

    vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
    vext.8 d27, d29, d30, #4

    vmlal.u8 q1, d26, d1
    vmlal.u8 q2, d27, d1

    add r0, r4, lr
    add r1, r0, lr
    add r2, r1, lr

    vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d3, q2, #7

    vst1.32 {d2[0]}, [r4] ;store result
    vst1.32 {d2[1]}, [r0]
    vst1.32 {d3[0]}, [r1]
    vst1.32 {d3[1]}, [r2]

    pop {r4, pc}

;--------------------
skip_firstpass_filter

    vld1.32 {d28[0]}, [r0], r1 ;load src data
    vld1.32 {d28[1]}, [r0], r1
    vld1.32 {d29[0]}, [r0], r1
    vld1.32 {d29[1]}, [r0], r1
    vld1.32 {d30[0]}, [r0], r1

    b secondpass_filter

;---------------------
skip_secondpass_filter
    vst1.32 {d28[0]}, [r4], lr ;store result
    vst1.32 {d28[1]}, [r4], lr
    vst1.32 {d29[0]}, [r4], lr
    vst1.32 {d29[1]}, [r4], lr

    pop {r4, pc}

    ENDP

;-----------------

bifilter4_coeff
    DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END
@ -1,135 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_bilinear_predict8x4_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch

|vp8_bilinear_predict8x4_neon| PROC
    push {r4, lr}

    adr r12, bifilter8x4_coeff
    ldr r4, [sp, #8] ;load parameters from stack
    ldr lr, [sp, #12] ;load parameters from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq skip_firstpass_filter

;First pass: output_height lines x output_width columns (5x8)
    add r2, r12, r2, lsl #3 ;calculate filter location

    vld1.u8 {q1}, [r0], r1 ;load src data
    vld1.u32 {d31}, [r2] ;load first_pass filter
    vld1.u8 {q2}, [r0], r1
    vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
    vld1.u8 {q3}, [r0], r1
    vdup.8 d1, d31[4]
    vld1.u8 {q4}, [r0], r1

    vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
    vld1.u8 {q5}, [r0], r1
    vmull.u8 q7, d4, d0
    vmull.u8 q8, d6, d0
    vmull.u8 q9, d8, d0
    vmull.u8 q10, d10, d0

    vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
    vext.8 d5, d4, d5, #1
    vext.8 d7, d6, d7, #1
    vext.8 d9, d8, d9, #1
    vext.8 d11, d10, d11, #1

    vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8 q7, d5, d1
    vmlal.u8 q8, d7, d1
    vmlal.u8 q9, d9, d1
    vmlal.u8 q10, d11, d1

    vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d23, q7, #7
    vqrshrn.u16 d24, q8, #7
    vqrshrn.u16 d25, q9, #7
    vqrshrn.u16 d26, q10, #7

;Second pass: 4x8
secondpass_filter
    cmp r3, #0 ;skip second_pass filter if yoffset=0
    beq skip_secondpass_filter

    add r3, r12, r3, lsl #3
    add r0, r4, lr

    vld1.u32 {d31}, [r3] ;load second_pass filter
    add r1, r0, lr

    vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
    vdup.8 d1, d31[4]

    vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
    vmull.u8 q2, d23, d0
    vmull.u8 q3, d24, d0
    vmull.u8 q4, d25, d0

    vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8 q2, d24, d1
    vmlal.u8 q3, d25, d1
    vmlal.u8 q4, d26, d1

    add r2, r1, lr

    vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d3, q2, #7
    vqrshrn.u16 d4, q3, #7
    vqrshrn.u16 d5, q4, #7

    vst1.u8 {d2}, [r4] ;store result
    vst1.u8 {d3}, [r0]
    vst1.u8 {d4}, [r1]
    vst1.u8 {d5}, [r2]

    pop {r4, pc}

;--------------------
skip_firstpass_filter
    vld1.u8 {d22}, [r0], r1 ;load src data
    vld1.u8 {d23}, [r0], r1
    vld1.u8 {d24}, [r0], r1
    vld1.u8 {d25}, [r0], r1
    vld1.u8 {d26}, [r0], r1

    b secondpass_filter

;---------------------
skip_secondpass_filter
    vst1.u8 {d22}, [r4], lr ;store result
    vst1.u8 {d23}, [r4], lr
    vst1.u8 {d24}, [r4], lr
    vst1.u8 {d25}, [r4], lr

    pop {r4, pc}

    ENDP

;-----------------

bifilter8x4_coeff
    DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END
@ -1,183 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_bilinear_predict8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch

|vp8_bilinear_predict8x8_neon| PROC
    push {r4, lr}

    adr r12, bifilter8_coeff
    ldr r4, [sp, #8] ;load parameters from stack
    ldr lr, [sp, #12] ;load parameters from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add r2, r12, r2, lsl #3 ;calculate filter location

    vld1.u8 {q1}, [r0], r1 ;load src data
    vld1.u32 {d31}, [r2] ;load first_pass filter
    vld1.u8 {q2}, [r0], r1
    vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
    vld1.u8 {q3}, [r0], r1
    vdup.8 d1, d31[4]
    vld1.u8 {q4}, [r0], r1

    vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
    vmull.u8 q7, d4, d0
    vmull.u8 q8, d6, d0
    vmull.u8 q9, d8, d0

    vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
    vext.8 d5, d4, d5, #1
    vext.8 d7, d6, d7, #1
    vext.8 d9, d8, d9, #1

    vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8 q7, d5, d1
    vmlal.u8 q8, d7, d1
    vmlal.u8 q9, d9, d1

    vld1.u8 {q1}, [r0], r1 ;load src data
    vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
    vld1.u8 {q2}, [r0], r1
    vqrshrn.u16 d23, q7, #7
    vld1.u8 {q3}, [r0], r1
    vqrshrn.u16 d24, q8, #7
    vld1.u8 {q4}, [r0], r1
    vqrshrn.u16 d25, q9, #7

;first_pass filtering on the rest 5-line data
    vld1.u8 {q5}, [r0], r1

    vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0])
    vmull.u8 q7, d4, d0
    vmull.u8 q8, d6, d0
    vmull.u8 q9, d8, d0
    vmull.u8 q10, d10, d0

    vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
    vext.8 d5, d4, d5, #1
    vext.8 d7, d6, d7, #1
    vext.8 d9, d8, d9, #1
    vext.8 d11, d10, d11, #1

    vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8 q7, d5, d1
    vmlal.u8 q8, d7, d1
    vmlal.u8 q9, d9, d1
    vmlal.u8 q10, d11, d1

    vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d27, q7, #7
    vqrshrn.u16 d28, q8, #7
    vqrshrn.u16 d29, q9, #7
    vqrshrn.u16 d30, q10, #7

;Second pass: 8x8
secondpass_filter
    cmp r3, #0 ;skip second_pass filter if yoffset=0
    beq skip_secondpass_filter

    add r3, r12, r3, lsl #3
    add r0, r4, lr

    vld1.u32 {d31}, [r3] ;load second_pass filter
    add r1, r0, lr

    vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
    vdup.8 d1, d31[4]

    vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0])
    vmull.u8 q2, d23, d0
    vmull.u8 q3, d24, d0
    vmull.u8 q4, d25, d0
    vmull.u8 q5, d26, d0
    vmull.u8 q6, d27, d0
    vmull.u8 q7, d28, d0
    vmull.u8 q8, d29, d0

    vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8 q2, d24, d1
    vmlal.u8 q3, d25, d1
    vmlal.u8 q4, d26, d1
    vmlal.u8 q5, d27, d1
    vmlal.u8 q6, d28, d1
    vmlal.u8 q7, d29, d1
    vmlal.u8 q8, d30, d1

    vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d3, q2, #7
    vqrshrn.u16 d4, q3, #7
    vqrshrn.u16 d5, q4, #7
    vqrshrn.u16 d6, q5, #7
    vqrshrn.u16 d7, q6, #7
    vqrshrn.u16 d8, q7, #7
    vqrshrn.u16 d9, q8, #7

    vst1.u8 {d2}, [r4] ;store result
    vst1.u8 {d3}, [r0]
    vst1.u8 {d4}, [r1], lr
    vst1.u8 {d5}, [r1], lr
    vst1.u8 {d6}, [r1], lr
    vst1.u8 {d7}, [r1], lr
    vst1.u8 {d8}, [r1], lr
    vst1.u8 {d9}, [r1], lr

    pop {r4, pc}

;--------------------
skip_firstpass_filter
    vld1.u8 {d22}, [r0], r1 ;load src data
    vld1.u8 {d23}, [r0], r1
    vld1.u8 {d24}, [r0], r1
    vld1.u8 {d25}, [r0], r1
    vld1.u8 {d26}, [r0], r1
    vld1.u8 {d27}, [r0], r1
    vld1.u8 {d28}, [r0], r1
    vld1.u8 {d29}, [r0], r1
    vld1.u8 {d30}, [r0], r1

    b secondpass_filter

;---------------------
skip_secondpass_filter
    vst1.u8 {d22}, [r4], lr ;store result
    vst1.u8 {d23}, [r4], lr
    vst1.u8 {d24}, [r4], lr
    vst1.u8 {d25}, [r4], lr
    vst1.u8 {d26}, [r4], lr
    vst1.u8 {d27}, [r4], lr
    vst1.u8 {d28}, [r4], lr
    vst1.u8 {d29}, [r4], lr

    pop {r4, pc}

    ENDP

;-----------------

bifilter8_coeff
    DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END
699
media/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c
Normal file
@ -0,0 +1,699 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

static const uint8_t bifilter4_coeff[8][2] = {
    {128, 0},
    {112, 16},
    { 96, 32},
    { 80, 48},
    { 64, 64},
    { 48, 80},
    { 32, 96},
    { 16, 112}
};

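/* Each bifilter4_coeff row holds the two taps of the 1/8-pel bilinear filter
 * for that subpel offset; the taps always sum to 128, so the
 * vqrshrn_n_u16(..., 7) calls below round the 16-bit products back to 8 bits. */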
void vp8_bilinear_predict4x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
    uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
    uint8x16_t q1u8, q2u8;
    uint16x8_t q1u16, q2u16;
    uint16x8_t q7u16, q8u16, q9u16;
    uint64x2_t q4u64, q5u64;
    uint64x1_t d12u64;
    uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;

    if (xoffset == 0) { // skip_1stpass_filter
        uint32x2_t d28u32 = vdup_n_u32(0);
        uint32x2_t d29u32 = vdup_n_u32(0);
        uint32x2_t d30u32 = vdup_n_u32(0);

        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
        src_ptr += src_pixels_per_line;
        d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
        src_ptr += src_pixels_per_line;
        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
        src_ptr += src_pixels_per_line;
        d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
        src_ptr += src_pixels_per_line;
        d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
        d28u8 = vreinterpret_u8_u32(d28u32);
        d29u8 = vreinterpret_u8_u32(d29u32);
        d30u8 = vreinterpret_u8_u32(d30u32);
    } else {
        d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d6u8 = vld1_u8(src_ptr);

        q1u8 = vcombine_u8(d2u8, d3u8);
        q2u8 = vcombine_u8(d4u8, d5u8);

        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);

        q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
        q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
        d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);

        d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
                           vreinterpret_u32_u8(vget_high_u8(q1u8)));
        d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
                           vreinterpret_u32_u8(vget_high_u8(q2u8)));
        d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
                           vreinterpret_u32_u64(vget_high_u64(q4u64)));
        d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
                           vreinterpret_u32_u64(vget_high_u64(q5u64)));

        q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
        q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
        q9u16 = vmull_u8(d6u8, d0u8);

        q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
        q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
        q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);

        d28u8 = vqrshrn_n_u16(q7u16, 7);
        d29u8 = vqrshrn_n_u16(q8u16, 7);
        d30u8 = vqrshrn_n_u16(q9u16, 7);
    }

    // secondpass_filter
    if (yoffset == 0) { // skip_2ndpass_filter
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
    } else {
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);

        q1u16 = vmull_u8(d28u8, d0u8);
        q2u16 = vmull_u8(d29u8, d0u8);

        d26u8 = vext_u8(d28u8, d29u8, 4);
        d27u8 = vext_u8(d29u8, d30u8, 4);

        q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
        q2u16 = vmlal_u8(q2u16, d27u8, d1u8);

        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);

        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
        dst_ptr += dst_pitch;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
    }
    return;
}

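/* 8x4 bilinear prediction: the first pass filters five 8-pixel rows
   horizontally, the second pass filters those results vertically into four
   output rows; either pass is skipped when its offset is zero. */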
void vp8_bilinear_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8;
    uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8;
    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16;
    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;

    if (xoffset == 0) { // skip_1stpass_filter
        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d26u8 = vld1_u8(src_ptr);
    } else {
        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q5u8 = vld1q_u8(src_ptr);

        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);

        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);

        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);

        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);

        d22u8 = vqrshrn_n_u16(q6u16, 7);
        d23u8 = vqrshrn_n_u16(q7u16, 7);
        d24u8 = vqrshrn_n_u16(q8u16, 7);
        d25u8 = vqrshrn_n_u16(q9u16, 7);
        d26u8 = vqrshrn_n_u16(q10u16, 7);
    }

    // secondpass_filter
    if (yoffset == 0) { // skip_2ndpass_filter
        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d25u8);
    } else {
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);

        q1u16 = vmull_u8(d22u8, d0u8);
        q2u16 = vmull_u8(d23u8, d0u8);
        q3u16 = vmull_u8(d24u8, d0u8);
        q4u16 = vmull_u8(d25u8, d0u8);

        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);

        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);
        d4u8 = vqrshrn_n_u16(q3u16, 7);
        d5u8 = vqrshrn_n_u16(q4u16, 7);

        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d5u8);
    }
    return;
}

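/* 8x8 bilinear prediction: nine source rows are filtered horizontally in two
   batches (4 rows, then 5), and the second pass reduces the nine intermediate
   rows to eight output rows. */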
void vp8_bilinear_predict8x8_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8;
    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8;
    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16;
    uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16;

    if (xoffset == 0) { // skip_1stpass_filter
        d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
        d30u8 = vld1_u8(src_ptr);
    } else {
        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;

        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);

        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);

        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);

        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);

        d22u8 = vqrshrn_n_u16(q6u16, 7);
        d23u8 = vqrshrn_n_u16(q7u16, 7);
        d24u8 = vqrshrn_n_u16(q8u16, 7);
        d25u8 = vqrshrn_n_u16(q9u16, 7);

        // first_pass filtering on the rest 5-line data
        q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
        q5u8 = vld1q_u8(src_ptr);

        q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
        q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
        q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
        q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);

        d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1);
        d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1);
        d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
        d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
        d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);

        q6u16 = vmlal_u8(q6u16, d3u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d5u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d7u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d9u8, d1u8);
        q10u16 = vmlal_u8(q10u16, d11u8, d1u8);

        d26u8 = vqrshrn_n_u16(q6u16, 7);
        d27u8 = vqrshrn_n_u16(q7u16, 7);
        d28u8 = vqrshrn_n_u16(q8u16, 7);
        d29u8 = vqrshrn_n_u16(q9u16, 7);
        d30u8 = vqrshrn_n_u16(q10u16, 7);
    }

    // secondpass_filter
    if (yoffset == 0) { // skip_2ndpass_filter
        vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d29u8);
    } else {
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);

        q1u16 = vmull_u8(d22u8, d0u8);
        q2u16 = vmull_u8(d23u8, d0u8);
        q3u16 = vmull_u8(d24u8, d0u8);
        q4u16 = vmull_u8(d25u8, d0u8);
        q5u16 = vmull_u8(d26u8, d0u8);
        q6u16 = vmull_u8(d27u8, d0u8);
        q7u16 = vmull_u8(d28u8, d0u8);
        q8u16 = vmull_u8(d29u8, d0u8);

        q1u16 = vmlal_u8(q1u16, d23u8, d1u8);
        q2u16 = vmlal_u8(q2u16, d24u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d25u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d26u8, d1u8);
        q5u16 = vmlal_u8(q5u16, d27u8, d1u8);
        q6u16 = vmlal_u8(q6u16, d28u8, d1u8);
        q7u16 = vmlal_u8(q7u16, d29u8, d1u8);
        q8u16 = vmlal_u8(q8u16, d30u8, d1u8);

        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);
        d4u8 = vqrshrn_n_u16(q3u16, 7);
        d5u8 = vqrshrn_n_u16(q4u16, 7);
        d6u8 = vqrshrn_n_u16(q5u16, 7);
        d7u8 = vqrshrn_n_u16(q6u16, 7);
        d8u8 = vqrshrn_n_u16(q7u16, 7);
        d9u8 = vqrshrn_n_u16(q8u16, 7);

        vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch;
        vst1_u8((uint8_t *)dst_ptr, d9u8);
    }
    return;
}

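/* 16x16 bilinear prediction. The xoffset == 0 and yoffset == 0 cases are
   handled as single-pass filters; otherwise the horizontal pass writes 17
   intermediate rows into the 272-byte tmp buffer (17 * 16) before the
   vertical pass produces the 16 output rows. */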
void vp8_bilinear_predict16x16_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    int i;
    unsigned char tmp[272];
    unsigned char *tmpp;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
    uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;

    if (xoffset == 0) { // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);

        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
        }
        return;
    }

    if (yoffset == 0) { // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);

        for (i = 4; i > 0; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;

            q7u16 = vmull_u8(d2u8, d0u8);
            q8u16 = vmull_u8(d3u8, d0u8);
            q9u16 = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8 = vext_u8(d2u8, d3u8, 1);
            d5u8 = vext_u8(d5u8, d6u8, 1);
            d8u8 = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8 = vext_u8(d3u8, d4u8, 1);
            d6u8 = vext_u8(d6u8, d7u8, 1);
            d9u8 = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q8u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch;
            vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch;
        }
        return;
    }

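    /* Both offsets are nonzero: run the 17-row horizontal pass into tmp,
       then filter tmp vertically into the destination. */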
    d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
    d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);

    d2u8 = vld1_u8(src_ptr);
    d3u8 = vld1_u8(src_ptr + 8);
    d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    d5u8 = vld1_u8(src_ptr);
    d6u8 = vld1_u8(src_ptr + 8);
    d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    d8u8 = vld1_u8(src_ptr);
    d9u8 = vld1_u8(src_ptr + 8);
    d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
    d11u8 = vld1_u8(src_ptr);
    d12u8 = vld1_u8(src_ptr + 8);
    d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;

    // First Pass: output_height lines x output_width columns (17x16)
    tmpp = tmp;
    for (i = 3; i > 0; i--) {
        q7u16 = vmull_u8(d2u8, d0u8);
        q8u16 = vmull_u8(d3u8, d0u8);
        q9u16 = vmull_u8(d5u8, d0u8);
        q10u16 = vmull_u8(d6u8, d0u8);
        q11u16 = vmull_u8(d8u8, d0u8);
        q12u16 = vmull_u8(d9u8, d0u8);
        q13u16 = vmull_u8(d11u8, d0u8);
        q14u16 = vmull_u8(d12u8, d0u8);

        d2u8 = vext_u8(d2u8, d3u8, 1);
        d5u8 = vext_u8(d5u8, d6u8, 1);
        d8u8 = vext_u8(d8u8, d9u8, 1);
        d11u8 = vext_u8(d11u8, d12u8, 1);

        q7u16 = vmlal_u8(q7u16, d2u8, d1u8);
        q9u16 = vmlal_u8(q9u16, d5u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

        d3u8 = vext_u8(d3u8, d4u8, 1);
        d6u8 = vext_u8(d6u8, d7u8, 1);
        d9u8 = vext_u8(d9u8, d10u8, 1);
        d12u8 = vext_u8(d12u8, d13u8, 1);

        q8u16 = vmlal_u8(q8u16, d3u8, d1u8);
        q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

        d14u8 = vqrshrn_n_u16(q7u16, 7);
        d15u8 = vqrshrn_n_u16(q8u16, 7);
        d16u8 = vqrshrn_n_u16(q9u16, 7);
        d17u8 = vqrshrn_n_u16(q10u16, 7);
        d18u8 = vqrshrn_n_u16(q11u16, 7);
        d19u8 = vqrshrn_n_u16(q12u16, 7);
        d20u8 = vqrshrn_n_u16(q13u16, 7);
        d21u8 = vqrshrn_n_u16(q14u16, 7);

        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;

        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);
        q10u8 = vcombine_u8(d20u8, d21u8);

        vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16;
    }

    // First-pass filtering for rest 5 lines
    d14u8 = vld1_u8(src_ptr);
    d15u8 = vld1_u8(src_ptr + 8);
    d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line;

    q9u16 = vmull_u8(d2u8, d0u8);
    q10u16 = vmull_u8(d3u8, d0u8);
    q11u16 = vmull_u8(d5u8, d0u8);
    q12u16 = vmull_u8(d6u8, d0u8);
    q13u16 = vmull_u8(d8u8, d0u8);
    q14u16 = vmull_u8(d9u8, d0u8);

    d2u8 = vext_u8(d2u8, d3u8, 1);
    d5u8 = vext_u8(d5u8, d6u8, 1);
    d8u8 = vext_u8(d8u8, d9u8, 1);

    q9u16 = vmlal_u8(q9u16, d2u8, d1u8);
    q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
    q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

    d3u8 = vext_u8(d3u8, d4u8, 1);
    d6u8 = vext_u8(d6u8, d7u8, 1);
    d9u8 = vext_u8(d9u8, d10u8, 1);

    q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
    q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
    q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

    q1u16 = vmull_u8(d11u8, d0u8);
    q2u16 = vmull_u8(d12u8, d0u8);
    q3u16 = vmull_u8(d14u8, d0u8);
    q4u16 = vmull_u8(d15u8, d0u8);

    d11u8 = vext_u8(d11u8, d12u8, 1);
    d14u8 = vext_u8(d14u8, d15u8, 1);

    q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
    q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

    d12u8 = vext_u8(d12u8, d13u8, 1);
    d15u8 = vext_u8(d15u8, d16u8, 1);

    q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
    q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

    d10u8 = vqrshrn_n_u16(q9u16, 7);
    d11u8 = vqrshrn_n_u16(q10u16, 7);
    d12u8 = vqrshrn_n_u16(q11u16, 7);
    d13u8 = vqrshrn_n_u16(q12u16, 7);
    d14u8 = vqrshrn_n_u16(q13u16, 7);
    d15u8 = vqrshrn_n_u16(q14u16, 7);
    d16u8 = vqrshrn_n_u16(q1u16, 7);
    d17u8 = vqrshrn_n_u16(q2u16, 7);
    d18u8 = vqrshrn_n_u16(q3u16, 7);
    d19u8 = vqrshrn_n_u16(q4u16, 7);

    q5u8 = vcombine_u8(d10u8, d11u8);
    q6u8 = vcombine_u8(d12u8, d13u8);
    q7u8 = vcombine_u8(d14u8, d15u8);
    q8u8 = vcombine_u8(d16u8, d17u8);
    q9u8 = vcombine_u8(d18u8, d19u8);

    vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16;
    vst1q_u8((uint8_t *)tmpp, q9u8);

    // secondpass_filter
    d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
    d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);

    tmpp = tmp;
    q11u8 = vld1q_u8(tmpp);
    tmpp += 16;
    for (i = 4; i > 0; i--) {
        q12u8 = vld1q_u8(tmpp); tmpp += 16;
        q13u8 = vld1q_u8(tmpp); tmpp += 16;
        q14u8 = vld1q_u8(tmpp); tmpp += 16;
        q15u8 = vld1q_u8(tmpp); tmpp += 16;

        q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
        q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
        q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
        q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
        q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
        q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
        q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
        q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

        q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
        q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
        q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
        q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
        q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
        q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
        q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
        q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

        d2u8 = vqrshrn_n_u16(q1u16, 7);
        d3u8 = vqrshrn_n_u16(q2u16, 7);
        d4u8 = vqrshrn_n_u16(q3u16, 7);
        d5u8 = vqrshrn_n_u16(q4u16, 7);
        d6u8 = vqrshrn_n_u16(q5u16, 7);
        d7u8 = vqrshrn_n_u16(q6u16, 7);
        d8u8 = vqrshrn_n_u16(q7u16, 7);
        d9u8 = vqrshrn_n_u16(q8u16, 7);

        q1u8 = vcombine_u8(d2u8, d3u8);
        q2u8 = vcombine_u8(d4u8, d5u8);
        q3u8 = vcombine_u8(d6u8, d7u8);
        q4u8 = vcombine_u8(d8u8, d9u8);

        q11u8 = q15u8;

        vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch;
        vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch;
        vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch;
        vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch;
    }
    return;
}
@ -1,584 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_build_intra_predictors_mby_neon_func|
    EXPORT |vp8_build_intra_predictors_mby_s_neon_func|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_neon_func| PROC
    push {r4-r8, lr}

    cmp r3, #0
    beq case_dc_pred
    cmp r3, #1
    beq case_v_pred
    cmp r3, #2
    beq case_h_pred
    cmp r3, #3
    beq case_tm_pred

case_dc_pred
    ldr r4, [sp, #24] ; Up
    ldr r5, [sp, #28] ; Left

    ; Default the DC average to 128
    mov r12, #128
    vdup.u8 q0, r12

    ; Zero out running sum
    mov r12, #0

    ; compute shift and jump
    adds r7, r4, r5
    beq skip_dc_pred_up_left

    ; Load above row, if it exists
    cmp r4, #0
    beq skip_dc_pred_up

    sub r6, r0, r2
    vld1.8 {q1}, [r6]
    vpaddl.u8 q2, q1
    vpaddl.u16 q3, q2
    vpaddl.u32 q4, q3

    vmov.32 r4, d8[0]
    vmov.32 r6, d9[0]

    add r12, r4, r6

    ; Move back to integer registers

skip_dc_pred_up

    cmp r5, #0
    beq skip_dc_pred_left

    sub r0, r0, #1

    ; Load left row, if it exists
    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0]

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

skip_dc_pred_left
    add r7, r7, #3 ; Shift
    sub r4, r7, #1
    mov r5, #1
    add r12, r12, r5, lsl r4
    mov r5, r12, lsr r7 ; expected_dc

    vdup.u8 q0, r5

skip_dc_pred_up_left
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!

    pop {r4-r8,pc}
case_v_pred
    ; Copy down above row
    sub r6, r0, r2
    vld1.8 {q0}, [r6]

    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q0}, [r1]!
    pop {r4-r8,pc}

case_h_pred
    ; Load 4x yleft_col
    sub r0, r0, #1

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q1}, [r1]!
    vst1.u8 {q2}, [r1]!
    vst1.u8 {q3}, [r1]!

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q1}, [r1]!
    vst1.u8 {q2}, [r1]!
    vst1.u8 {q3}, [r1]!


    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q1}, [r1]!
    vst1.u8 {q2}, [r1]!
    vst1.u8 {q3}, [r1]!

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1]!
    vst1.u8 {q1}, [r1]!
    vst1.u8 {q2}, [r1]!
    vst1.u8 {q3}, [r1]!

    pop {r4-r8,pc}

case_tm_pred
    ; Load yabove_row
    sub r3, r0, r2
    vld1.8 {q8}, [r3]

    ; Load ytop_left
    sub r3, r3, #1
    ldrb r7, [r3]

    vdup.u16 q7, r7

    ; Compute yabove_row - ytop_left
    mov r3, #1
    vdup.u8 q0, r3

    vmull.u8 q4, d16, d0
    vmull.u8 q5, d17, d0

    vsub.s16 q4, q4, q7
    vsub.s16 q5, q5, q7

    ; Load 4x yleft_col
    sub r0, r0, #1
    mov r12, #4

case_tm_pred_loop
    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u16 q0, r3
    vdup.u16 q1, r4
    vdup.u16 q2, r5
    vdup.u16 q3, r6

    vqadd.s16 q8, q0, q4
    vqadd.s16 q9, q0, q5

    vqadd.s16 q10, q1, q4
    vqadd.s16 q11, q1, q5

    vqadd.s16 q12, q2, q4
    vqadd.s16 q13, q2, q5

    vqadd.s16 q14, q3, q4
    vqadd.s16 q15, q3, q5

    vqshrun.s16 d0, q8, #0
    vqshrun.s16 d1, q9, #0

    vqshrun.s16 d2, q10, #0
    vqshrun.s16 d3, q11, #0

    vqshrun.s16 d4, q12, #0
    vqshrun.s16 d5, q13, #0

    vqshrun.s16 d6, q14, #0
    vqshrun.s16 d7, q15, #0

    vst1.u8 {q0}, [r1]!
    vst1.u8 {q1}, [r1]!
    vst1.u8 {q2}, [r1]!
    vst1.u8 {q3}, [r1]!

    subs r12, r12, #1
    bne case_tm_pred_loop

    pop {r4-r8,pc}

    ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_s_neon_func| PROC
    push {r4-r8, lr}

    mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;

    cmp r3, #0
    beq case_dc_pred_s
    cmp r3, #1
    beq case_v_pred_s
    cmp r3, #2
    beq case_h_pred_s
    cmp r3, #3
    beq case_tm_pred_s

case_dc_pred_s
    ldr r4, [sp, #24] ; Up
    ldr r5, [sp, #28] ; Left

    ; Default the DC average to 128
    mov r12, #128
    vdup.u8 q0, r12

    ; Zero out running sum
    mov r12, #0

    ; compute shift and jump
    adds r7, r4, r5
    beq skip_dc_pred_up_left_s

    ; Load above row, if it exists
    cmp r4, #0
    beq skip_dc_pred_up_s

    sub r6, r0, r2
    vld1.8 {q1}, [r6]
    vpaddl.u8 q2, q1
    vpaddl.u16 q3, q2
    vpaddl.u32 q4, q3

    vmov.32 r4, d8[0]
    vmov.32 r6, d9[0]

    add r12, r4, r6

    ; Move back to integer registers

skip_dc_pred_up_s

    cmp r5, #0
    beq skip_dc_pred_left_s

    sub r0, r0, #1

    ; Load left row, if it exists
    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0]

    add r12, r12, r3
    add r12, r12, r4
    add r12, r12, r5
    add r12, r12, r6

skip_dc_pred_left_s
    add r7, r7, #3 ; Shift
    sub r4, r7, #1
    mov r5, #1
    add r12, r12, r5, lsl r4
    mov r5, r12, lsr r7 ; expected_dc

    vdup.u8 q0, r5

skip_dc_pred_up_left_s
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2

    pop {r4-r8,pc}
case_v_pred_s
    ; Copy down above row
    sub r6, r0, r2
    vld1.8 {q0}, [r6]

    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q0}, [r1], r2
    pop {r4-r8,pc}

case_h_pred_s
    ; Load 4x yleft_col
    sub r0, r0, #1

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q1}, [r1], r2
    vst1.u8 {q2}, [r1], r2
    vst1.u8 {q3}, [r1], r2

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q1}, [r1], r2
    vst1.u8 {q2}, [r1], r2
    vst1.u8 {q3}, [r1], r2


    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q1}, [r1], r2
    vst1.u8 {q2}, [r1], r2
    vst1.u8 {q3}, [r1], r2

    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u8 q0, r3
    vdup.u8 q1, r4
    vdup.u8 q2, r5
    vdup.u8 q3, r6
    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q1}, [r1], r2
    vst1.u8 {q2}, [r1], r2
    vst1.u8 {q3}, [r1], r2

    pop {r4-r8,pc}

case_tm_pred_s
    ; Load yabove_row
    sub r3, r0, r2
    vld1.8 {q8}, [r3]

    ; Load ytop_left
    sub r3, r3, #1
    ldrb r7, [r3]

    vdup.u16 q7, r7

    ; Compute yabove_row - ytop_left
    mov r3, #1
    vdup.u8 q0, r3

    vmull.u8 q4, d16, d0
    vmull.u8 q5, d17, d0

    vsub.s16 q4, q4, q7
    vsub.s16 q5, q5, q7

    ; Load 4x yleft_col
    sub r0, r0, #1
    mov r12, #4

case_tm_pred_loop_s
    ldrb r3, [r0], r2
    ldrb r4, [r0], r2
    ldrb r5, [r0], r2
    ldrb r6, [r0], r2
    vdup.u16 q0, r3
    vdup.u16 q1, r4
    vdup.u16 q2, r5
    vdup.u16 q3, r6

    vqadd.s16 q8, q0, q4
    vqadd.s16 q9, q0, q5

    vqadd.s16 q10, q1, q4
    vqadd.s16 q11, q1, q5

    vqadd.s16 q12, q2, q4
    vqadd.s16 q13, q2, q5

    vqadd.s16 q14, q3, q4
    vqadd.s16 q15, q3, q5

    vqshrun.s16 d0, q8, #0
    vqshrun.s16 d1, q9, #0

    vqshrun.s16 d2, q10, #0
    vqshrun.s16 d3, q11, #0

    vqshrun.s16 d4, q12, #0
    vqshrun.s16 d5, q13, #0

    vqshrun.s16 d6, q14, #0
    vqshrun.s16 d7, q15, #0

    vst1.u8 {q0}, [r1], r2
    vst1.u8 {q1}, [r1], r2
    vst1.u8 {q2}, [r1], r2
    vst1.u8 {q3}, [r1], r2

    subs r12, r12, #1
    bne case_tm_pred_loop_s

    pop {r4-r8,pc}

    ENDP


    END
@ -1,59 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_copy_mem16x16_neon|
    ; ARM
    ; REQUIRE8
    ; PRESERVE8

    AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem16x16_neon| PROC

    vld1.u8 {q0}, [r0], r1
    vld1.u8 {q1}, [r0], r1
    vld1.u8 {q2}, [r0], r1
    vst1.u8 {q0}, [r2], r3
    vld1.u8 {q3}, [r0], r1
    vst1.u8 {q1}, [r2], r3
    vld1.u8 {q4}, [r0], r1
    vst1.u8 {q2}, [r2], r3
    vld1.u8 {q5}, [r0], r1
    vst1.u8 {q3}, [r2], r3
    vld1.u8 {q6}, [r0], r1
    vst1.u8 {q4}, [r2], r3
    vld1.u8 {q7}, [r0], r1
    vst1.u8 {q5}, [r2], r3
    vld1.u8 {q8}, [r0], r1
    vst1.u8 {q6}, [r2], r3
    vld1.u8 {q9}, [r0], r1
    vst1.u8 {q7}, [r2], r3
    vld1.u8 {q10}, [r0], r1
    vst1.u8 {q8}, [r2], r3
    vld1.u8 {q11}, [r0], r1
    vst1.u8 {q9}, [r2], r3
    vld1.u8 {q12}, [r0], r1
    vst1.u8 {q10}, [r2], r3
    vld1.u8 {q13}, [r0], r1
    vst1.u8 {q11}, [r2], r3
    vld1.u8 {q14}, [r0], r1
    vst1.u8 {q12}, [r2], r3
    vld1.u8 {q15}, [r0], r1
    vst1.u8 {q13}, [r2], r3
    vst1.u8 {q14}, [r2], r3
    vst1.u8 {q15}, [r2], r3

    mov pc, lr

    ENDP ; |vp8_copy_mem16x16_neon|

    END
@ -1,34 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem8x4_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_copy_mem8x4_neon| PROC
|
||||
vld1.u8 {d0}, [r0], r1
|
||||
vld1.u8 {d1}, [r0], r1
|
||||
vst1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r0], r1
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem8x4_neon|
|
||||
|
||||
END
|
@ -1,43 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem8x8_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_copy_mem8x8_neon| PROC
|
||||
|
||||
vld1.u8 {d0}, [r0], r1
|
||||
vld1.u8 {d1}, [r0], r1
|
||||
vst1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r0], r1
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vld1.u8 {d4}, [r0], r1
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
vld1.u8 {d5}, [r0], r1
|
||||
vst1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d6}, [r0], r1
|
||||
vst1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d7}, [r0], r1
|
||||
vst1.u8 {d6}, [r2], r3
|
||||
vst1.u8 {d7}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem8x8_neon|
|
||||
|
||||
END
59 media/libvpx/vp8/common/arm/neon/copymem_neon.c Normal file
@@ -0,0 +1,59 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

void vp8_copy_mem8x4_neon(
        unsigned char *src,
        int src_stride,
        unsigned char *dst,
        int dst_stride) {
    uint8x8_t vtmp;
    int r;

    for (r = 0; r < 4; r++) {
        vtmp = vld1_u8(src);
        vst1_u8(dst, vtmp);
        src += src_stride;
        dst += dst_stride;
    }
}

void vp8_copy_mem8x8_neon(
        unsigned char *src,
        int src_stride,
        unsigned char *dst,
        int dst_stride) {
    uint8x8_t vtmp;
    int r;

    for (r = 0; r < 8; r++) {
        vtmp = vld1_u8(src);
        vst1_u8(dst, vtmp);
        src += src_stride;
        dst += dst_stride;
    }
}

void vp8_copy_mem16x16_neon(
        unsigned char *src,
        int src_stride,
        unsigned char *dst,
        int dst_stride) {
    int r;
    uint8x16_t qtmp;

    for (r = 0; r < 16; r++) {
        qtmp = vld1q_u8(src);
        vst1q_u8(dst, qtmp);
        src += src_stride;
        dst += dst_stride;
    }
}
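For context: the intrinsics replacements above trade the deleted assembly's software-pipelined load/store interleave for a plain per-row copy loop, leaving instruction scheduling to the compiler; the 8-wide variants move one 8-byte d-register per row and the 16x16 variant one 16-byte q-register per row.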
@@ -1,54 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;


    EXPORT |vp8_dc_only_idct_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
;                            int pred_stride, unsigned char *dst_ptr,
;                            int dst_stride)

; r0  input_dc
; r1  pred_ptr
; r2  pred_stride
; r3  dst_ptr
; sp  dst_stride

|vp8_dc_only_idct_add_neon| PROC
    add r0, r0, #4
    asr r0, r0, #3
    ldr r12, [sp]
    vdup.16 q0, r0

    vld1.32 {d2[0]}, [r1], r2
    vld1.32 {d2[1]}, [r1], r2
    vld1.32 {d4[0]}, [r1], r2
    vld1.32 {d4[1]}, [r1]

    vaddw.u8 q1, q0, d2
    vaddw.u8 q2, q0, d4

    vqmovun.s16 d2, q1
    vqmovun.s16 d4, q2

    vst1.32 {d2[0]}, [r3], r12
    vst1.32 {d2[1]}, [r3], r12
    vst1.32 {d4[0]}, [r3], r12
    vst1.32 {d4[1]}, [r3]

    bx lr

    ENDP

    END
42 media/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c Normal file
@@ -0,0 +1,42 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

void vp8_dc_only_idct_add_neon(
        int16_t input_dc,
        unsigned char *pred_ptr,
        int pred_stride,
        unsigned char *dst_ptr,
        int dst_stride) {
    int i;
    uint16_t a1 = ((input_dc + 4) >> 3);
    uint32x2_t d2u32 = vdup_n_u32(0);
    uint8x8_t d2u8;
    uint16x8_t q1u16;
    uint16x8_t qAdd;

    qAdd = vdupq_n_u16(a1);

    for (i = 0; i < 2; i++) {
        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
        pred_ptr += pred_stride;
        d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
        pred_ptr += pred_stride;

        q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));

        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
        dst_ptr += dst_stride;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
        dst_ptr += dst_stride;
    }
}
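For reference, a scalar sketch of the operation the intrinsics above vectorize, consistent with the C reference vp8_dc_only_idct_add_c: the lone dequantized DC coefficient is biased, shifted, and added to the 4x4 predictor with unsigned saturation (the clamp helper is illustrative):

static unsigned char clamp_u8(int v) {
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void dc_only_idct_add_ref(short input_dc, unsigned char *pred_ptr,
                                 int pred_stride, unsigned char *dst_ptr,
                                 int dst_stride) {
    int a1 = (input_dc + 4) >> 3;   /* bias and shift the DC term */
    int r, c;
    for (r = 0; r < 4; r++) {
        for (c = 0; c < 4; c++)
            dst_ptr[c] = clamp_u8(pred_ptr[c] + a1);
        pred_ptr += pred_stride;
        dst_ptr += dst_stride;
    }
}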
@@ -1,131 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_dequant_idct_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_add_neon(short *input, short *dq,
;                               unsigned char *dest, int stride)
; r0  short *input,
; r1  short *dq,
; r2  unsigned char *dest
; r3  int stride

|vp8_dequant_idct_add_neon| PROC
    vld1.16 {q3, q4}, [r0]
    vld1.16 {q5, q6}, [r1]

    add r1, r2, r3 ; r1 = dest + stride
    lsl r3, #1 ; 2x stride

    vld1.32 {d14[0]}, [r2], r3
    vld1.32 {d14[1]}, [r1], r3
    vld1.32 {d15[0]}, [r2]
    vld1.32 {d15[1]}, [r1]

    adr r12, cospi8sqrt2minus1 ; pointer to the first constant

    vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
    vmul.i16 q2, q4, q6

;|short_idct4x4llm_neon| PROC
    vld1.16 {d0}, [r12]
    vswp d3, d4 ;q2(vp[4] vp[12])

    vqdmulh.s16 q3, q2, d0[2]
    vqdmulh.s16 q4, q2, d0[0]

    vqadd.s16 d12, d2, d3 ;a1
    vqsub.s16 d13, d2, d3 ;b1

    vshr.s16 q3, q3, #1
    vshr.s16 q4, q4, #1

    vqadd.s16 q3, q3, q2
    vqadd.s16 q4, q4, q2

    vqsub.s16 d10, d6, d9 ;c1
    vqadd.s16 d11, d7, d8 ;d1

    vqadd.s16 d2, d12, d11
    vqadd.s16 d3, d13, d10
    vqsub.s16 d4, d13, d10
    vqsub.s16 d5, d12, d11

    vtrn.32 d2, d4
    vtrn.32 d3, d5
    vtrn.16 d2, d3
    vtrn.16 d4, d5

    ; memset(input, 0, 32) -- 32bytes
    vmov.i16 q14, #0

    vswp d3, d4
    vqdmulh.s16 q3, q2, d0[2]
    vqdmulh.s16 q4, q2, d0[0]

    vqadd.s16 d12, d2, d3 ;a1
    vqsub.s16 d13, d2, d3 ;b1

    vmov q15, q14

    vshr.s16 q3, q3, #1
    vshr.s16 q4, q4, #1

    vqadd.s16 q3, q3, q2
    vqadd.s16 q4, q4, q2

    vqsub.s16 d10, d6, d9 ;c1
    vqadd.s16 d11, d7, d8 ;d1

    vqadd.s16 d2, d12, d11
    vqadd.s16 d3, d13, d10
    vqsub.s16 d4, d13, d10
    vqsub.s16 d5, d12, d11

    vst1.16 {q14, q15}, [r0]

    vrshr.s16 d2, d2, #3
    vrshr.s16 d3, d3, #3
    vrshr.s16 d4, d4, #3
    vrshr.s16 d5, d5, #3

    vtrn.32 d2, d4
    vtrn.32 d3, d5
    vtrn.16 d2, d3
    vtrn.16 d4, d5

    vaddw.u8 q1, q1, d14
    vaddw.u8 q2, q2, d15

    sub r2, r2, r3
    sub r1, r1, r3

    vqmovun.s16 d0, q1
    vqmovun.s16 d1, q2

    vst1.32 {d0[0]}, [r2], r3
    vst1.32 {d0[1]}, [r1], r3
    vst1.32 {d1[0]}, [r2]
    vst1.32 {d1[1]}, [r1]

    bx lr

    ENDP ; |vp8_dequant_idct_add_neon|

; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2       DCD 0x8a8c8a8c

    END
142 media/libvpx/vp8/common/arm/neon/dequant_idct_neon.c Normal file
@@ -0,0 +1,142 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
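/* Editorial note, not in the patch: 35468 does not fit in int16_t, so this
 * initializer wraps to -30068 on two's-complement targets, mirroring the
 * 0x8a8c constant of the deleted assembly. vqdmulh computes (x * c * 2) >> 16,
 * so with the wrapped constant the product comes out as ((x * 35468) >> 15) - 2*x;
 * the vshrq_n_s16(..., 1) and vqaddq_s16(..., q2) pair below folds the halving
 * and the +x correction back in, recovering the (x * 35468) >> 16 term of the
 * VP8 inverse DCT. The same shift/add pair turns the 20091 product into
 * x + ((x * 20091) >> 16). */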

void vp8_dequant_idct_add_neon(
        int16_t *input,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0;
    int32x2_t d14, d15;
    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
    int16x8_t q1, q2, q3, q4, q5, q6;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x2x2_t d2tmp0, d2tmp1;
    int16x4x2_t d2tmp2, d2tmp3;

    d14 = d15 = vdup_n_s32(0);

    // load input
    q3 = vld1q_s16(input);
    vst1q_s16(input, qEmpty);
    input += 8;
    q4 = vld1q_s16(input);
    vst1q_s16(input, qEmpty);

    // load dq
    q5 = vld1q_s16(dq);
    dq += 8;
    q6 = vld1q_s16(dq);

    // load src from dst
    dst0 = dst;
    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
    dst0 += stride;
    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
    dst0 += stride;
    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
    dst0 += stride;
    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);

    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
                                         vreinterpretq_u16_s16(q5)));
    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
                                         vreinterpretq_u16_s16(q6)));

    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));

    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));

    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);

    q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);

    q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);

    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));

    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);

    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
                      vreinterpret_s16_s32(d2tmp1.val[0]));
    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
                      vreinterpret_s16_s32(d2tmp1.val[1]));

    // loop 2
    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);

    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);

    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);

    q3 = vshrq_n_s16(q3, 1);
    q4 = vshrq_n_s16(q4, 1);

    q3 = vqaddq_s16(q3, q2);
    q4 = vqaddq_s16(q4, q2);

    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));

    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);

    d2 = vrshr_n_s16(d2, 3);
    d3 = vrshr_n_s16(d3, 3);
    d4 = vrshr_n_s16(d4, 3);
    d5 = vrshr_n_s16(d5, 3);

    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
                      vreinterpret_s16_s32(d2tmp1.val[0]));
    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
                      vreinterpret_s16_s32(d2tmp1.val[1]));

    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);

    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
                                        vreinterpret_u8_s32(d14)));
    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
                                        vreinterpret_u8_s32(d15)));

    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));

    dst0 = dst;
    vst1_lane_s32((int32_t *)dst0, d14, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d14, 1);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d15, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d15, 1);
    return;
}
@@ -1,34 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_dequantize_b_loop_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0  short *Q,
; r1  short *DQC
; r2  short *DQ
|vp8_dequantize_b_loop_neon| PROC
    vld1.16 {q0, q1}, [r0]
    vld1.16 {q2, q3}, [r1]

    vmul.i16 q4, q0, q2
    vmul.i16 q5, q1, q3

    vst1.16 {q4, q5}, [r2]

    bx lr

    ENDP

    END
25 media/libvpx/vp8/common/arm/neon/dequantizeb_neon.c Normal file
@@ -0,0 +1,25 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "vp8/common/blockd.h"

void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
    int16x8x2_t qQ, qDQC, qDQ;

    qQ   = vld2q_s16(d->qcoeff);
    qDQC = vld2q_s16(DQC);

    qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
    qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);

    vst2q_s16(d->dqcoeff, qDQ);
}
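For reference, the scalar equivalent of the function above is a 16-element elementwise multiply, as in the C reference vp8_dequantize_b_c:

/* for (i = 0; i < 16; i++) d->dqcoeff[i] = d->qcoeff[i] * DQC[i]; */

The vld2q/vst2q pair de-interleaves even and odd coefficients and re-interleaves them on the store; since the multiply is applied lane-wise, the interleave pattern does not change the result.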
@@ -1,79 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;


    EXPORT |idct_dequant_0_2x_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_0_2x_neon(short *q, short dq,
;                            unsigned char *dst, int stride);
; r0  *q
; r1  dq
; r2  *dst
; r3  stride
|idct_dequant_0_2x_neon| PROC
    push {r4, r5}

    add r12, r2, #4
    vld1.32 {d2[0]}, [r2], r3
    vld1.32 {d8[0]}, [r12], r3
    vld1.32 {d2[1]}, [r2], r3
    vld1.32 {d8[1]}, [r12], r3
    vld1.32 {d4[0]}, [r2], r3
    vld1.32 {d10[0]}, [r12], r3
    vld1.32 {d4[1]}, [r2], r3
    vld1.32 {d10[1]}, [r12], r3

    ldrh r12, [r0] ; lo q
    ldrh r4, [r0, #32] ; hi q
    mov r5, #0
    strh r5, [r0]
    strh r5, [r0, #32]

    sxth r12, r12 ; lo
    mul r0, r12, r1
    add r0, r0, #4
    asr r0, r0, #3
    vdup.16 q0, r0
    sxth r4, r4 ; hi
    mul r0, r4, r1
    add r0, r0, #4
    asr r0, r0, #3
    vdup.16 q3, r0

    vaddw.u8 q1, q0, d2 ; lo
    vaddw.u8 q2, q0, d4
    vaddw.u8 q4, q3, d8 ; hi
    vaddw.u8 q5, q3, d10

    sub r2, r2, r3, lsl #2 ; dst - 4*stride
    add r0, r2, #4

    vqmovun.s16 d2, q1 ; lo
    vqmovun.s16 d4, q2
    vqmovun.s16 d8, q4 ; hi
    vqmovun.s16 d10, q5

    vst1.32 {d2[0]}, [r2], r3 ; lo
    vst1.32 {d8[0]}, [r0], r3 ; hi
    vst1.32 {d2[1]}, [r2], r3
    vst1.32 {d8[1]}, [r0], r3
    vst1.32 {d4[0]}, [r2], r3
    vst1.32 {d10[0]}, [r0], r3
    vst1.32 {d4[1]}, [r2]
    vst1.32 {d10[1]}, [r0]

    pop {r4, r5}
    bx lr

    ENDP ; |idct_dequant_0_2x_neon|
    END
62 media/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c Normal file
@@ -0,0 +1,62 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

void idct_dequant_0_2x_neon(
        int16_t *q,
        int16_t dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0;
    int i, a0, a1;
    int16x8x2_t q2Add;
    /* zero-initialized: vld1_lane reads its input vector */
    int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
    uint8x8_t d2u8, d4u8;
    uint16x8_t q1u16, q2u16;

    a0 = ((q[0] * dq) + 4) >> 3;
    a1 = ((q[16] * dq) + 4) >> 3;
    q[0] = q[16] = 0;
    q2Add.val[0] = vdupq_n_s16((int16_t)a0);
    q2Add.val[1] = vdupq_n_s16((int16_t)a1);

    for (i = 0; i < 2; i++, dst += 4) {
        dst0 = dst;
        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
        dst0 += stride;
        d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
        dst0 += stride;
        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
        dst0 += stride;
        d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);

        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                         vreinterpret_u8_s32(d2s32));
        q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                         vreinterpret_u8_s32(d4s32));

        d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
        d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));

        d2s32 = vreinterpret_s32_u8(d2u8);
        d4s32 = vreinterpret_s32_u8(d4u8);

        dst0 = dst;
        vst1_lane_s32((int32_t *)dst0, d2s32, 0);
        dst0 += stride;
        vst1_lane_s32((int32_t *)dst0, d2s32, 1);
        dst0 += stride;
        vst1_lane_s32((int32_t *)dst0, d4s32, 0);
        dst0 += stride;
        vst1_lane_s32((int32_t *)dst0, d4s32, 1);
    }
    return;
}
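A hedged scalar sketch of the DC-only path handled above, per 4x4 block (dst doubles as the predictor; the second block of the pair starts at dst + 4):

/* dc = ((q[0] * dq) + 4) >> 3;  q[0] = 0;
   for each of the 16 pixels:
       dst[r * stride + c] = clamp_u8(dst[r * stride + c] + dc); */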
@@ -1,196 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |idct_dequant_full_2x_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_full_2x_neon(short *q, short *dq,
;                               unsigned char *dst, int stride);
; r0  *q,
; r1  *dq,
; r2  *dst
; r3  stride
|idct_dequant_full_2x_neon| PROC
    vld1.16 {q0, q1}, [r1] ; dq (same l/r)
    vld1.16 {q2, q3}, [r0] ; l q
    add r0, r0, #32
    vld1.16 {q4, q5}, [r0] ; r q
    add r12, r2, #4

    ; interleave the predictors
    vld1.32 {d28[0]}, [r2], r3 ; l pre
    vld1.32 {d28[1]}, [r12], r3 ; r pre
    vld1.32 {d29[0]}, [r2], r3
    vld1.32 {d29[1]}, [r12], r3
    vld1.32 {d30[0]}, [r2], r3
    vld1.32 {d30[1]}, [r12], r3
    vld1.32 {d31[0]}, [r2], r3
    vld1.32 {d31[1]}, [r12]

    adr r1, cospi8sqrt2minus1 ; pointer to the first constant

    ; dequant: q[i] = q[i] * dq[i]
    vmul.i16 q2, q2, q0
    vmul.i16 q3, q3, q1
    vmul.i16 q4, q4, q0
    vmul.i16 q5, q5, q1

    vld1.16 {d0}, [r1]

    ; q2: l0r0 q3: l8r8
    ; q4: l4r4 q5: l12r12
    vswp d5, d8
    vswp d7, d10

    ; _CONSTANTS_ * 4,12 >> 16
    ; q6:  4 * sinpi : c1/temp1
    ; q7: 12 * sinpi : d1/temp2
    ; q8:  4 * cospi
    ; q9: 12 * cospi
    vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
    vqdmulh.s16 q7, q5, d0[2]
    vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
    vqdmulh.s16 q9, q5, d0[0]

    vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
    vqsub.s16 q11, q2, q3 ; b1 = 0 - 8

    ; vqdmulh only accepts signed values. this was a problem because
    ; our constant had the high bit set, and was treated as a negative value.
    ; vqdmulh also doubles the value before it shifts by 16. we need to
    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
    ; so we can shift the constant without losing precision. this avoids
    ; shift again afterward, but also avoids the sign issue. win win!
    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
    ; pre-shift it
    vshr.s16 q8, q8, #1
    vshr.s16 q9, q9, #1

    ; q4:  4 +  4 * cospi : d1/temp1
    ; q5: 12 + 12 * cospi : c1/temp2
    vqadd.s16 q4, q4, q8
    vqadd.s16 q5, q5, q9

    ; c1 = temp1 - temp2
    ; d1 = temp1 + temp2
    vqsub.s16 q2, q6, q5
    vqadd.s16 q3, q4, q7

    ; [0]: a1+d1
    ; [1]: b1+c1
    ; [2]: b1-c1
    ; [3]: a1-d1
    vqadd.s16 q4, q10, q3
    vqadd.s16 q5, q11, q2
    vqsub.s16 q6, q11, q2
    vqsub.s16 q7, q10, q3

    ; rotate
    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q4, q5
    vtrn.16 q6, q7
    ; idct loop 2
    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
    ; q6: l 2, 6,10,14 r 2, 6,10,14
    ; q7: l 3, 7,11,15 r 3, 7,11,15

    ; q8:  1 * sinpi : c1/temp1
    ; q9:  3 * sinpi : d1/temp2
    ; q10: 1 * cospi
    ; q11: 3 * cospi
    vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
    vqdmulh.s16 q9, q7, d0[2]
    vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
    vqdmulh.s16 q11, q7, d0[0]

    vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
    vqsub.s16 q3, q4, q6 ; b1 = 0 - 2

    ; see note on shifting above
    vshr.s16 q10, q10, #1
    vshr.s16 q11, q11, #1

    ; q10: 1 + 1 * cospi : d1/temp1
    ; q11: 3 + 3 * cospi : c1/temp2
    vqadd.s16 q10, q5, q10
    vqadd.s16 q11, q7, q11

    ; q8: c1 = temp1 - temp2
    ; q9: d1 = temp1 + temp2
    vqsub.s16 q8, q8, q11
    vqadd.s16 q9, q10, q9

    ; a1+d1
    ; b1+c1
    ; b1-c1
    ; a1-d1
    vqadd.s16 q4, q2, q9
    vqadd.s16 q5, q3, q8
    vqsub.s16 q6, q3, q8
    vqsub.s16 q7, q2, q9

    ; +4 >> 3 (rounding)
    vrshr.s16 q4, q4, #3 ; lo
    vrshr.s16 q5, q5, #3
    vrshr.s16 q6, q6, #3 ; hi
    vrshr.s16 q7, q7, #3

    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q4, q5
    vtrn.16 q6, q7

    ; adding pre
    ; input is still packed. pre was read interleaved
    vaddw.u8 q4, q4, d28
    vaddw.u8 q5, q5, d29
    vaddw.u8 q6, q6, d30
    vaddw.u8 q7, q7, d31

    vmov.i16 q14, #0
    vmov q15, q14
    vst1.16 {q14, q15}, [r0] ; write over high input
    sub r0, r0, #32
    vst1.16 {q14, q15}, [r0] ; write over low input

    sub r2, r2, r3, lsl #2 ; dst - 4*stride
    add r1, r2, #4 ; hi

    ;saturate and narrow
    vqmovun.s16 d0, q4 ; lo
    vqmovun.s16 d1, q5
    vqmovun.s16 d2, q6 ; hi
    vqmovun.s16 d3, q7

    vst1.32 {d0[0]}, [r2], r3 ; lo
    vst1.32 {d0[1]}, [r1], r3 ; hi
    vst1.32 {d1[0]}, [r2], r3
    vst1.32 {d1[1]}, [r1], r3
    vst1.32 {d2[0]}, [r2], r3
    vst1.32 {d2[1]}, [r1], r3
    vst1.32 {d3[0]}, [r2]
    vst1.32 {d3[1]}, [r1]

    bx lr

    ENDP ; |idct_dequant_full_2x_neon|

; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2       DCD 0x4546

    END
185 media/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c Normal file
@@ -0,0 +1,185 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 17734;
// because the lowest bit in 0x8a8c is 0, we can pre-shift this
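/* Editorial note, not in the patch: vqdmulh computes (x * c * 2) >> 16, so
 * halving 35468 (0x8a8c) to 17734 both cancels the doubling and keeps the
 * constant inside int16_t range: (x * 17734 * 2) >> 16 == (x * 35468) >> 16.
 * 20091 is odd, so it cannot be pre-shifted without losing a bit; it is used
 * as-is and the doubling is undone by the vshrq_n_s16(..., 1) applied to its
 * products below. */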

void idct_dequant_full_2x_neon(
        int16_t *q,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *dst0, *dst1;
    int32x2_t d28, d29, d30, d31;
    int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
    int16x8_t qEmpty = vdupq_n_s16(0);
    int32x4x2_t q2tmp0, q2tmp1;
    int16x8x2_t q2tmp2, q2tmp3;
    int16x4_t dLow0, dLow1, dHigh0, dHigh1;

    d28 = d29 = d30 = d31 = vdup_n_s32(0);

    // load dq
    q0 = vld1q_s16(dq);
    dq += 8;
    q1 = vld1q_s16(dq);

    // load q
    q2 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q3 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q4 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);
    q += 8;
    q5 = vld1q_s16(q);
    vst1q_s16(q, qEmpty);

    // load src from dst
    dst0 = dst;
    dst1 = dst + 4;
    d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
    dst0 += stride;
    d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
    dst1 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
    dst0 += stride;
    d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
    dst1 += stride;

    d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
    dst0 += stride;
    d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
    dst1 += stride;
    d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
    d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

    q2 = vmulq_s16(q2, q0);
    q3 = vmulq_s16(q3, q1);
    q4 = vmulq_s16(q4, q0);
    q5 = vmulq_s16(q5, q1);

    // vswp
    dLow0 = vget_low_s16(q2);
    dHigh0 = vget_high_s16(q2);
    dLow1 = vget_low_s16(q4);
    dHigh1 = vget_high_s16(q4);
    q2 = vcombine_s16(dLow0, dLow1);
    q4 = vcombine_s16(dHigh0, dHigh1);

    dLow0 = vget_low_s16(q3);
    dHigh0 = vget_high_s16(q3);
    dLow1 = vget_low_s16(q5);
    dHigh1 = vget_high_s16(q5);
    q3 = vcombine_s16(dLow0, dLow1);
    q5 = vcombine_s16(dHigh0, dHigh1);

    q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
    q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
    q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
    q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

    q10 = vqaddq_s16(q2, q3);
    q11 = vqsubq_s16(q2, q3);

    q8 = vshrq_n_s16(q8, 1);
    q9 = vshrq_n_s16(q9, 1);

    q4 = vqaddq_s16(q4, q8);
    q5 = vqaddq_s16(q5, q9);

    q2 = vqsubq_s16(q6, q5);
    q3 = vqaddq_s16(q7, q4);

    q4 = vqaddq_s16(q10, q3);
    q5 = vqaddq_s16(q11, q2);
    q6 = vqsubq_s16(q11, q2);
    q7 = vqsubq_s16(q10, q3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    // loop 2
    q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
    q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
    q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
    q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

    q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
    q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

    q10 = vshrq_n_s16(q10, 1);
    q11 = vshrq_n_s16(q11, 1);

    q10 = vqaddq_s16(q2tmp2.val[1], q10);
    q11 = vqaddq_s16(q2tmp3.val[1], q11);

    q8 = vqsubq_s16(q8, q11);
    q9 = vqaddq_s16(q9, q10);

    q4 = vqaddq_s16(q2, q9);
    q5 = vqaddq_s16(q3, q8);
    q6 = vqsubq_s16(q3, q8);
    q7 = vqsubq_s16(q2, q9);

    q4 = vrshrq_n_s16(q4, 3);
    q5 = vrshrq_n_s16(q5, 3);
    q6 = vrshrq_n_s16(q6, 3);
    q7 = vrshrq_n_s16(q7, 3);

    q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
    q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
    q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                       vreinterpretq_s16_s32(q2tmp1.val[0]));
    q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                       vreinterpretq_s16_s32(q2tmp1.val[1]));

    q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
                                        vreinterpret_u8_s32(d28)));
    q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
                                        vreinterpret_u8_s32(d29)));
    q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
                                        vreinterpret_u8_s32(d30)));
    q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
                                        vreinterpret_u8_s32(d31)));

    d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
    d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
    d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
    d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

    dst0 = dst;
    dst1 = dst + 4;
    vst1_lane_s32((int32_t *)dst0, d28, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d28, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d29, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d29, 1);
    dst1 += stride;

    vst1_lane_s32((int32_t *)dst0, d30, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst1, d30, 1);
    dst1 += stride;
    vst1_lane_s32((int32_t *)dst0, d31, 0);
    vst1_lane_s32((int32_t *)dst1, d31, 1);
    return;
}
@@ -1,87 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
    EXPORT |vp8_short_inv_walsh4x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA |.text|, CODE, READONLY ; name this block of code

;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC

    ; read in all four lines of values: d0->d3
    vld1.i16 {q0-q1}, [r0@128]

    ; first for loop
    vadd.s16 d4, d0, d3 ;a = [0] + [12]
    vadd.s16 d6, d1, d2 ;b = [4] + [8]
    vsub.s16 d5, d0, d3 ;d = [0] - [12]
    vsub.s16 d7, d1, d2 ;c = [4] - [8]

    vadd.s16 q0, q2, q3 ; a+b d+c
    vsub.s16 q1, q2, q3 ; a-b d-c

    vtrn.32 d0, d2 ;d0:  0  1  8  9
                   ;d2:  2  3 10 11
    vtrn.32 d1, d3 ;d1:  4  5 12 13
                   ;d3:  6  7 14 15

    vtrn.16 d0, d1 ;d0:  0  4  8 12
                   ;d1:  1  5  9 13
    vtrn.16 d2, d3 ;d2:  2  6 10 14
                   ;d3:  3  7 11 15

    ; second for loop

    vadd.s16 d4, d0, d3 ;a = [0] + [3]
    vadd.s16 d6, d1, d2 ;b = [1] + [2]
    vsub.s16 d5, d0, d3 ;d = [0] - [3]
    vsub.s16 d7, d1, d2 ;c = [1] - [2]

    vmov.i16 q8, #3

    vadd.s16 q0, q2, q3 ; a+b d+c
    vsub.s16 q1, q2, q3 ; a-b d-c

    vadd.i16 q0, q0, q8 ;e/f += 3
    vadd.i16 q1, q1, q8 ;g/h += 3

    vshr.s16 q0, q0, #3 ;e/f >> 3
    vshr.s16 q1, q1, #3 ;g/h >> 3

    mov r2, #64
    add r3, r1, #32

    vst1.i16 d0[0], [r1],r2
    vst1.i16 d1[0], [r3],r2
    vst1.i16 d2[0], [r1],r2
    vst1.i16 d3[0], [r3],r2

    vst1.i16 d0[1], [r1],r2
    vst1.i16 d1[1], [r3],r2
    vst1.i16 d2[1], [r1],r2
    vst1.i16 d3[1], [r3],r2

    vst1.i16 d0[2], [r1],r2
    vst1.i16 d1[2], [r3],r2
    vst1.i16 d2[2], [r1],r2
    vst1.i16 d3[2], [r3],r2

    vst1.i16 d0[3], [r1],r2
    vst1.i16 d1[3], [r3],r2
    vst1.i16 d2[3], [r1]
    vst1.i16 d3[3], [r3]

    bx lr
    ENDP ; |vp8_short_inv_walsh4x4_neon|

    END
102 media/libvpx/vp8/common/arm/neon/iwalsh_neon.c Normal file
@@ -0,0 +1,102 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

void vp8_short_inv_walsh4x4_neon(
        int16_t *input,
        int16_t *mb_dqcoeff) {
    int16x8_t q0s16, q1s16, q2s16, q3s16;
    int16x4_t d4s16, d5s16, d6s16, d7s16;
    int16x4x2_t v2tmp0, v2tmp1;
    int32x2x2_t v2tmp2, v2tmp3;
    int16x8_t qAdd3;

    q0s16 = vld1q_s16(input);
    q1s16 = vld1q_s16(input + 8);

    // 1st for loop
    d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
    d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
    d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
    d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));

    q2s16 = vcombine_s16(d4s16, d5s16);
    q3s16 = vcombine_s16(d6s16, d7s16);

    q0s16 = vaddq_s16(q2s16, q3s16);
    q1s16 = vsubq_s16(q2s16, q3s16);

    v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
                      vreinterpret_s32_s16(vget_low_s16(q1s16)));
    v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
                      vreinterpret_s32_s16(vget_high_s16(q1s16)));
    v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
                      vreinterpret_s16_s32(v2tmp3.val[0]));
    v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
                      vreinterpret_s16_s32(v2tmp3.val[1]));

    // 2nd for loop
    d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
    d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
    d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
    d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
    q2s16 = vcombine_s16(d4s16, d5s16);
    q3s16 = vcombine_s16(d6s16, d7s16);

    qAdd3 = vdupq_n_s16(3);

    q0s16 = vaddq_s16(q2s16, q3s16);
    q1s16 = vsubq_s16(q2s16, q3s16);

    q0s16 = vaddq_s16(q0s16, qAdd3);
    q1s16 = vaddq_s16(q1s16, qAdd3);

    q0s16 = vshrq_n_s16(q0s16, 3);
    q1s16 = vshrq_n_s16(q1s16, 3);

    // store
    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
    mb_dqcoeff += 16;

    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
    mb_dqcoeff += 16;

    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
    mb_dqcoeff += 16;

    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
    mb_dqcoeff += 16;
    vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
    mb_dqcoeff += 16;

    return;
}
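For reference, a scalar sketch of the inverse Walsh-Hadamard transform vectorized above, consistent with the C reference vp8_short_inv_walsh4x4_c: column butterflies, row butterflies with a +3 bias and >>3, and each of the 16 results scattered to the DC slot of one 16-coefficient block (stride 16, matching the mb_dqcoeff += 16 stores above):

static void short_inv_walsh4x4_ref(short *input, short *mb_dqcoeff) {
    short output[16];
    int i;
    for (i = 0; i < 4; i++) {                 /* columns */
        int a = input[i] + input[12 + i];
        int b = input[4 + i] + input[8 + i];
        int c = input[4 + i] - input[8 + i];
        int d = input[i] - input[12 + i];
        output[i] = a + b;
        output[4 + i] = c + d;
        output[8 + i] = a - b;
        output[12 + i] = d - c;
    }
    for (i = 0; i < 4; i++) {                 /* rows, with rounding */
        int a = output[4 * i + 0] + output[4 * i + 3];
        int b = output[4 * i + 1] + output[4 * i + 2];
        int c = output[4 * i + 1] - output[4 * i + 2];
        int d = output[4 * i + 0] - output[4 * i + 3];
        mb_dqcoeff[(4 * i + 0) * 16] = (short)((a + b + 3) >> 3);
        mb_dqcoeff[(4 * i + 1) * 16] = (short)((c + d + 3) >> 3);
        mb_dqcoeff[(4 * i + 2) * 16] = (short)((a - b + 3) >> 3);
        mb_dqcoeff[(4 * i + 3) * 16] = (short)((d - c + 3) >> 3);
    }
}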
@@ -1,397 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
    EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
    EXPORT |vp8_loop_filter_vertical_edge_y_neon|
    EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0  unsigned char *src
; r1  int pitch
; r2  unsigned char blimit
; r3  unsigned char limit
; sp  unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
    push {lr}
    vdup.u8 q0, r2 ; duplicate blimit
    vdup.u8 q1, r3 ; duplicate limit
    sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
    ldr r3, [sp, #4] ; load thresh
    add r12, r2, r1
    add r1, r1, r1

    vdup.u8 q2, r3 ; duplicate thresh

    vld1.u8 {q3}, [r2@128], r1 ; p3
    vld1.u8 {q4}, [r12@128], r1 ; p2
    vld1.u8 {q5}, [r2@128], r1 ; p1
    vld1.u8 {q6}, [r12@128], r1 ; p0
    vld1.u8 {q7}, [r2@128], r1 ; q0
    vld1.u8 {q8}, [r12@128], r1 ; q1
    vld1.u8 {q9}, [r2@128] ; q2
    vld1.u8 {q10}, [r12@128] ; q3

    sub r2, r2, r1, lsl #1
    sub r12, r12, r1, lsl #1

    bl vp8_loop_filter_neon

    vst1.u8 {q5}, [r2@128], r1 ; store op1
    vst1.u8 {q6}, [r12@128], r1 ; store op0
    vst1.u8 {q7}, [r2@128], r1 ; store oq0
    vst1.u8 {q8}, [r12@128], r1 ; store oq1

    pop {pc}
    ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|


; r0    unsigned char *u,
; r1    int pitch,
; r2    unsigned char blimit
; r3    unsigned char limit
; sp    unsigned char thresh,
; sp+4  unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
    push {lr}
    vdup.u8 q0, r2 ; duplicate blimit
    vdup.u8 q1, r3 ; duplicate limit
    ldr r12, [sp, #4] ; load thresh
    ldr r2, [sp, #8] ; load v ptr
    vdup.u8 q2, r12 ; duplicate thresh

    sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
    sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines

    vld1.u8 {d6}, [r3@64], r1 ; p3
    vld1.u8 {d7}, [r12@64], r1 ; p3
    vld1.u8 {d8}, [r3@64], r1 ; p2
    vld1.u8 {d9}, [r12@64], r1 ; p2
    vld1.u8 {d10}, [r3@64], r1 ; p1
    vld1.u8 {d11}, [r12@64], r1 ; p1
    vld1.u8 {d12}, [r3@64], r1 ; p0
    vld1.u8 {d13}, [r12@64], r1 ; p0
    vld1.u8 {d14}, [r3@64], r1 ; q0
    vld1.u8 {d15}, [r12@64], r1 ; q0
    vld1.u8 {d16}, [r3@64], r1 ; q1
    vld1.u8 {d17}, [r12@64], r1 ; q1
    vld1.u8 {d18}, [r3@64], r1 ; q2
    vld1.u8 {d19}, [r12@64], r1 ; q2
    vld1.u8 {d20}, [r3@64] ; q3
    vld1.u8 {d21}, [r12@64] ; q3

    bl vp8_loop_filter_neon

    sub r0, r0, r1, lsl #1
    sub r2, r2, r1, lsl #1

    vst1.u8 {d10}, [r0@64], r1 ; store u op1
    vst1.u8 {d11}, [r2@64], r1 ; store v op1
    vst1.u8 {d12}, [r0@64], r1 ; store u op0
    vst1.u8 {d13}, [r2@64], r1 ; store v op0
    vst1.u8 {d14}, [r0@64], r1 ; store u oq0
    vst1.u8 {d15}, [r2@64], r1 ; store v oq0
    vst1.u8 {d16}, [r0@64] ; store u oq1
    vst1.u8 {d17}, [r2@64] ; store v oq1

    pop {pc}
    ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|

; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
;                                           const signed char *flimit,
;                                           const signed char *limit,
;                                           const signed char *thresh,
;                                           int count)
; r0  unsigned char *src
; r1  int pitch
; r2  unsigned char blimit
; r3  unsigned char limit
; sp  unsigned char thresh,

|vp8_loop_filter_vertical_edge_y_neon| PROC
    push {lr}
    vdup.u8 q0, r2 ; duplicate blimit
    vdup.u8 q1, r3 ; duplicate limit
    sub r2, r0, #4 ; src ptr down by 4 columns
    add r1, r1, r1
    ldr r3, [sp, #4] ; load thresh
    add r12, r2, r1, asr #1

    vld1.u8 {d6}, [r2], r1
    vld1.u8 {d8}, [r12], r1
    vld1.u8 {d10}, [r2], r1
    vld1.u8 {d12}, [r12], r1
    vld1.u8 {d14}, [r2], r1
    vld1.u8 {d16}, [r12], r1
    vld1.u8 {d18}, [r2], r1
    vld1.u8 {d20}, [r12], r1

    vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
    vld1.u8 {d9}, [r12], r1
    vld1.u8 {d11}, [r2], r1
    vld1.u8 {d13}, [r12], r1
    vld1.u8 {d15}, [r2], r1
    vld1.u8 {d17}, [r12], r1
    vld1.u8 {d19}, [r2]
    vld1.u8 {d21}, [r12]

    ;transpose to 8x16 matrix
    vtrn.32 q3, q7
    vtrn.32 q4, q8
    vtrn.32 q5, q9
    vtrn.32 q6, q10

    vdup.u8 q2, r3 ; duplicate thresh

    vtrn.16 q3, q5
    vtrn.16 q4, q6
    vtrn.16 q7, q9
    vtrn.16 q8, q10

    vtrn.8 q3, q4
    vtrn.8 q5, q6
    vtrn.8 q7, q8
    vtrn.8 q9, q10

    bl vp8_loop_filter_neon

    vswp d12, d11
    vswp d16, d13

    sub r0, r0, #2 ; dst ptr

    vswp d14, d12
    vswp d16, d15

    add r12, r0, r1, asr #1

    ;store op1, op0, oq0, oq1
    vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
    vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
    vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
    vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
    vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
    vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
    vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
    vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1

    vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
    vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
    vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
    vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
    vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
    vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
    vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
    vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]

    pop {pc}
    ENDP ; |vp8_loop_filter_vertical_edge_y_neon|

; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
;                                            const signed char *flimit,
;                                            const signed char *limit,
;                                            const signed char *thresh,
;                                            unsigned char *v)
; r0    unsigned char *u,
; r1    int pitch,
; r2    unsigned char blimit
; r3    unsigned char limit
; sp    unsigned char thresh,
; sp+4  unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
    push {lr}
    vdup.u8 q0, r2 ; duplicate blimit
    sub r12, r0, #4 ; move u pointer down by 4 columns
    ldr r2, [sp, #8] ; load v ptr
    vdup.u8 q1, r3 ; duplicate limit
    sub r3, r2, #4 ; move v pointer down by 4 columns

    vld1.u8 {d6}, [r12], r1 ;load u data
    vld1.u8 {d7}, [r3], r1 ;load v data
    vld1.u8 {d8}, [r12], r1
    vld1.u8 {d9}, [r3], r1
    vld1.u8 {d10}, [r12], r1
    vld1.u8 {d11}, [r3], r1
    vld1.u8 {d12}, [r12], r1
    vld1.u8 {d13}, [r3], r1
    vld1.u8 {d14}, [r12], r1
    vld1.u8 {d15}, [r3], r1
    vld1.u8 {d16}, [r12], r1
    vld1.u8 {d17}, [r3], r1
    vld1.u8 {d18}, [r12], r1
    vld1.u8 {d19}, [r3], r1
    vld1.u8 {d20}, [r12]
    vld1.u8 {d21}, [r3]

    ldr r12, [sp, #4] ; load thresh

    ;transpose to 8x16 matrix
    vtrn.32 q3, q7
    vtrn.32 q4, q8
    vtrn.32 q5, q9
    vtrn.32 q6, q10

    vdup.u8 q2, r12 ; duplicate thresh

    vtrn.16 q3, q5
    vtrn.16 q4, q6
    vtrn.16 q7, q9
    vtrn.16 q8, q10

    vtrn.8 q3, q4
    vtrn.8 q5, q6
    vtrn.8 q7, q8
    vtrn.8 q9, q10

    bl vp8_loop_filter_neon

    vswp d12, d11
    vswp d16, d13
    vswp d14, d12
    vswp d16, d15

    sub r0, r0, #2
    sub r2, r2, #2

    ;store op1, op0, oq0, oq1
    vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
    vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
    vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
    vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
    vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
    vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
    vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
    vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
    vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
    vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
    vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
    vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
    vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
    vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
    vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
    vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]

    pop {pc}
    ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|

; void vp8_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store.

; r0-r3 PRESERVE
; q0    flimit
; q1    limit
; q2    thresh
; q3    p3
; q4    p2
; q5    p1
; q6    p0
; q7    q0
; q8    q1
; q9    q2
; q10   q3
|vp8_loop_filter_neon| PROC

    ; vp8_filter_mask
    vabd.u8 q11, q3, q4 ; abs(p3 - p2)
    vabd.u8 q12, q4, q5 ; abs(p2 - p1)
    vabd.u8 q13, q5, q6 ; abs(p1 - p0)
    vabd.u8 q14, q8, q7 ; abs(q1 - q0)
    vabd.u8 q3, q9, q8 ; abs(q2 - q1)
    vabd.u8 q4, q10, q9 ; abs(q3 - q2)

    vmax.u8 q11, q11, q12
    vmax.u8 q12, q13, q14
    vmax.u8 q3, q3, q4
    vmax.u8 q15, q11, q12

    vabd.u8 q9, q6, q7 ; abs(p0 - q0)

    ; vp8_hevmask
    vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
    vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
    vmax.u8 q15, q15, q3

    vmov.u8 q10, #0x80 ; 0x80

    vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
    vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2

    vcge.u8 q15, q1, q15

    ; vp8_filter() function
    ; convert to signed
    veor q7, q7, q10 ; qs0
    vshr.u8 q2, q2, #1 ; a = a / 2
    veor q6, q6, q10 ; ps0

    veor q5, q5, q10 ; ps1
    vqadd.u8 q9, q9, q2 ; a = b + a

    veor q8, q8, q10 ; qs1

    vmov.u8 q10, #3 ; #3

    vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
    vsubl.s8 q11, d15, d13

    vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1

    vmovl.u8 q4, d20

    vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
    vorr q14, q13, q14 ; vp8_hevmask

    vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
    vmul.i16 q11, q11, q4

    vand q1, q1, q14 ; vp8_filter &= hev
    vand q15, q15, q9 ; vp8_filter_mask

    vaddw.s8 q2, q2, d2
    vaddw.s8 q11, q11, d3

    vmov.u8 q9, #4 ; #4

    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16 d2, q2
    vqmovn.s16 d3, q11
    vand q1, q1, q15 ; vp8_filter &= mask

    vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
    vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
    vshr.s8 q2, q2, #3 ; Filter2 >>= 3
    vshr.s8 q1, q1, #3 ; Filter1 >>= 3


    vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
    vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)

    ; outer tap adjustments: ++vp8_filter >> 1
    vrshr.s8 q1, q1, #1
    vbic q1, q1, q14 ; vp8_filter &= ~hev
    vmov.u8 q0, #0x80 ; 0x80
    vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
    vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)

    veor q6, q11, q0 ; *op0 = u^0x80
    veor q7, q10, q0 ; *oq0 = u^0x80
    veor q5, q13, q0 ; *op1 = u^0x80
    veor q8, q12, q0 ; *oq1 = u^0x80

    bx lr
    ENDP ; |vp8_loop_filter_neon|

;-----------------

END
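For orientation before the intrinsics version below, a hedged scalar sketch of the per-pixel kernel the helper implements, modeled on the vp8_filter routine of the C reference (the clamp helper and the sc typedef are illustrative; mask and hev are 0 or 0xff per pixel, as produced by the mask computation above):

typedef signed char sc;

static sc clamp_s8(int t) {   /* saturate to [-128, 127] */
    return (sc)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter_kernel(sc mask, unsigned char hev,
                          unsigned char *op1, unsigned char *op0,
                          unsigned char *oq0, unsigned char *oq1) {
    sc ps1 = (sc)(*op1 ^ 0x80), ps0 = (sc)(*op0 ^ 0x80);   /* to signed */
    sc qs0 = (sc)(*oq0 ^ 0x80), qs1 = (sc)(*oq1 ^ 0x80);
    sc f, Filter1, Filter2, u;

    f = clamp_s8(ps1 - qs1);           /* outer taps only under high edge variance */
    f &= (sc)hev;
    f = clamp_s8(f + 3 * (qs0 - ps0));
    f &= mask;

    Filter1 = clamp_s8(f + 4) >> 3;
    Filter2 = clamp_s8(f + 3) >> 3;
    *oq0 = (unsigned char)(clamp_s8(qs0 - Filter1) ^ 0x80);
    *op0 = (unsigned char)(clamp_s8(ps0 + Filter2) ^ 0x80);

    u = (sc)(((Filter1 + 1) >> 1) & ~(sc)hev);   /* outer tap adjustment */
    *oq1 = (unsigned char)(clamp_s8(qs1 - u) ^ 0x80);
    *op1 = (unsigned char)(clamp_s8(ps1 + u) ^ 0x80);
}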
|
549
media/libvpx/vp8/common/arm/neon/loopfilter_neon.c
Normal file
549
media/libvpx/vp8/common/arm/neon/loopfilter_neon.c
Normal file
@ -0,0 +1,549 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

static INLINE void vp8_loop_filter_neon(
        uint8x16_t qblimit,  // flimit
        uint8x16_t qlimit,   // limit
        uint8x16_t qthresh,  // thresh
        uint8x16_t q3,       // p3
        uint8x16_t q4,       // p2
        uint8x16_t q5,       // p1
        uint8x16_t q6,       // p0
        uint8x16_t q7,       // q0
        uint8x16_t q8,       // q1
        uint8x16_t q9,       // q2
        uint8x16_t q10,      // q3
        uint8x16_t *q5r,     // p1
        uint8x16_t *q6r,     // p0
        uint8x16_t *q7r,     // q0
        uint8x16_t *q8r) {   // q1
    uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q2s16, q11s16;
    uint16x8_t q4u16;
    int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
    int8x8_t d2s8, d3s8;

    q11u8 = vabdq_u8(q3, q4);
    q12u8 = vabdq_u8(q4, q5);
    q13u8 = vabdq_u8(q5, q6);
    q14u8 = vabdq_u8(q8, q7);
    q3 = vabdq_u8(q9, q8);
    q4 = vabdq_u8(q10, q9);

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q3 = vmaxq_u8(q3, q4);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q9 = vabdq_u8(q6, q7);

    // vp8_hevmask
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q3);

    q2u8 = vabdq_u8(q5, q8);
    q9 = vqaddq_u8(q9, q9);

    q15u8 = vcgeq_u8(qlimit, q15u8);

    // vp8_filter() function
    // convert to signed
    q10 = vdupq_n_u8(0x80);
    q8 = veorq_u8(q8, q10);
    q7 = veorq_u8(q7, q10);
    q6 = veorq_u8(q6, q10);
    q5 = veorq_u8(q5, q10);

    q2u8 = vshrq_n_u8(q2u8, 1);
    q9 = vqaddq_u8(q9, q2u8);

    q10 = vdupq_n_u8(3);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    q9 = vcgeq_u8(qblimit, q9);

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));

    q14u8 = vorrq_u8(q13u8, q14u8);

    q4u16 = vmovl_u8(vget_low_u8(q10));
    q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
    q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
    q15u8 = vandq_u8(q15u8, q9);

    q1s8 = vreinterpretq_s8_u8(q1u8);
    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

    q9 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2s8 = vqmovn_s16(q2s16);
    d3s8 = vqmovn_s16(q11s16);
    q1s8 = vcombine_s8(d2s8, d3s8);
    q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
    q1s8 = vreinterpretq_s8_u8(q1u8);

    q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
    q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q1s8 = vshrq_n_s8(q1s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

    q1s8 = vrshrq_n_s8(q1s8, 1);
    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
    q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

    q0u8 = vdupq_n_u8(0x80);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
    return;
}

void vp8_loop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);
    src -= (pitch << 2);

    q3 = vld1q_u8(src);
    src += pitch;
    q4 = vld1q_u8(src);
    src += pitch;
    q5 = vld1q_u8(src);
    src += pitch;
    q6 = vld1q_u8(src);
    src += pitch;
    q7 = vld1q_u8(src);
    src += pitch;
    q8 = vld1q_u8(src);
    src += pitch;
    q9 = vld1q_u8(src);
    src += pitch;
    q10 = vld1q_u8(src);

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    src -= (pitch * 5);
    vst1q_u8(src, q5);
    src += pitch;
    vst1q_u8(src, q6);
    src += pitch;
    vst1q_u8(src, q7);
    src += pitch;
    vst1q_u8(src, q8);
    return;
}

void vp8_loop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    u -= (pitch << 2);
    v -= (pitch << 2);

    d6 = vld1_u8(u);
    u += pitch;
    d7 = vld1_u8(v);
    v += pitch;
    d8 = vld1_u8(u);
    u += pitch;
    d9 = vld1_u8(v);
    v += pitch;
    d10 = vld1_u8(u);
    u += pitch;
    d11 = vld1_u8(v);
    v += pitch;
    d12 = vld1_u8(u);
    u += pitch;
    d13 = vld1_u8(v);
    v += pitch;
    d14 = vld1_u8(u);
    u += pitch;
    d15 = vld1_u8(v);
    v += pitch;
    d16 = vld1_u8(u);
    u += pitch;
    d17 = vld1_u8(v);
    v += pitch;
    d18 = vld1_u8(u);
    u += pitch;
    d19 = vld1_u8(v);
    v += pitch;
    d20 = vld1_u8(u);
    d21 = vld1_u8(v);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    u -= (pitch * 5);
    vst1_u8(u, vget_low_u8(q5));
    u += pitch;
    vst1_u8(u, vget_low_u8(q6));
    u += pitch;
    vst1_u8(u, vget_low_u8(q7));
    u += pitch;
    vst1_u8(u, vget_low_u8(q8));

    v -= (pitch * 5);
    vst1_u8(v, vget_high_u8(q5));
    v += pitch;
    vst1_u8(v, vget_high_u8(q6));
    v += pitch;
    vst1_u8(v, vget_high_u8(q7));
    v += pitch;
    vst1_u8(v, vget_high_u8(q8));
    return;
}

static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
    vst4_lane_u8(dst, result, 0);
    dst += pitch;
    vst4_lane_u8(dst, result, 1);
    dst += pitch;
    vst4_lane_u8(dst, result, 2);
    dst += pitch;
    vst4_lane_u8(dst, result, 3);
    dst += pitch;
    vst4_lane_u8(dst, result, 4);
    dst += pitch;
    vst4_lane_u8(dst, result, 5);
    dst += pitch;
    vst4_lane_u8(dst, result, 6);
    dst += pitch;
    vst4_lane_u8(dst, result, 7);
#else
    /*
     * uint8x8x4_t result
    00 01 02 03 | 04 05 06 07
    10 11 12 13 | 14 15 16 17
    20 21 22 23 | 24 25 26 27
    30 31 32 33 | 34 35 36 37
    ---
    * after vtrn_u16
    00 01 20 21 | 04 05 24 25
    02 03 22 23 | 06 07 26 27
    10 11 30 31 | 14 15 34 35
    12 13 32 33 | 16 17 36 37
    ---
    * after vtrn_u8
    00 10 20 30 | 04 14 24 34
    01 11 21 31 | 05 15 25 35
    02 12 22 32 | 06 16 26 36
    03 13 23 33 | 07 17 27 37
    */
    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                          vreinterpret_u16_u8(result.val[2]));
    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                          vreinterpret_u16_u8(result.val[3]));
    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                       vreinterpret_u8_u16(r13_u16.val[0]));
    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                       vreinterpret_u8_u16(r13_u16.val[1]));
    const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
    const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
    const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
    const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
    vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
    dst += pitch;
    vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#endif
}

void vp8_loop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *s, *d;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    s = src - 4;
    d6 = vld1_u8(s);
    s += pitch;
    d8 = vld1_u8(s);
    s += pitch;
    d10 = vld1_u8(s);
    s += pitch;
    d12 = vld1_u8(s);
    s += pitch;
    d14 = vld1_u8(s);
    s += pitch;
    d16 = vld1_u8(s);
    s += pitch;
    d18 = vld1_u8(s);
    s += pitch;
    d20 = vld1_u8(s);
    s += pitch;
    d7 = vld1_u8(s);
    s += pitch;
    d9 = vld1_u8(s);
    s += pitch;
    d11 = vld1_u8(s);
    s += pitch;
    d13 = vld1_u8(s);
    s += pitch;
    d15 = vld1_u8(s);
    s += pitch;
    d17 = vld1_u8(s);
    s += pitch;
    d19 = vld1_u8(s);
    s += pitch;
    d21 = vld1_u8(s);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17

    d = src - 2;
    write_4x8(d, pitch, q4ResultL);
    d += pitch * 8;
    write_4x8(d, pitch, q4ResultH);
}

void vp8_loop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
    uint8x8x4_t q4ResultH, q4ResultL;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    us = u - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d20 = vld1_u8(us);

    vs = v - 4;
    d7 = vld1_u8(vs);
    vs += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q5, &q6, &q7, &q8);

    q4ResultL.val[0] = vget_low_u8(q5);   // d10
    q4ResultL.val[1] = vget_low_u8(q6);   // d12
    q4ResultL.val[2] = vget_low_u8(q7);   // d14
    q4ResultL.val[3] = vget_low_u8(q8);   // d16
    ud = u - 2;
    write_4x8(ud, pitch, q4ResultL);

    q4ResultH.val[0] = vget_high_u8(q5);  // d11
    q4ResultH.val[1] = vget_high_u8(q6);  // d13
    q4ResultH.val[2] = vget_high_u8(q7);  // d15
    q4ResultH.val[3] = vget_high_u8(q8);  // d17
    vd = v - 2;
    write_4x8(vd, pitch, q4ResultH);
}
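A hypothetical call site for the horizontal luma filter added above, with illustrative threshold values (not taken from libvpx): the function reads rows src-4*pitch through src+3*pitch and rewrites rows src-2*pitch through src+pitch, so src points at the first row below the edge.

void filter_block_edge_example(void) {
    unsigned char y[16 * 16] = {0};  /* a 16x16 luma block; fill before use */
    const int pitch = 16;
    /* Filter the edge between rows 7 and 8 of the block. */
    vp8_loop_filter_horizontal_edge_y_neon(y + 8 * pitch, pitch,
                                           255, 63, 63);  /* blimit, limit, thresh */
}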
@ -1,117 +0,0 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    ;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
    EXPORT |vp8_loop_filter_bhs_neon|
    EXPORT |vp8_loop_filter_mbhs_neon|
    ARM
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *s, PRESERVE
; r1    int p, PRESERVE
; q1    limit, PRESERVE

|vp8_loop_filter_simple_horizontal_edge_neon| PROC

    sub r3, r0, r1, lsl #1       ; move src pointer down by 2 lines

    vld1.u8 {q7}, [r0@128], r1   ; q0
    vld1.u8 {q5}, [r3@128], r1   ; p0
    vld1.u8 {q8}, [r0@128]       ; q1
    vld1.u8 {q6}, [r3@128]       ; p1

    vabd.u8 q15, q6, q7          ; abs(p0 - q0)
    vabd.u8 q14, q5, q8          ; abs(p1 - q1)

    vqadd.u8 q15, q15, q15       ; abs(p0 - q0) * 2
    vshr.u8 q14, q14, #1         ; abs(p1 - q1) / 2
    vmov.u8 q0, #0x80            ; 0x80
    vmov.s16 q13, #3
    vqadd.u8 q15, q15, q14       ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    veor q7, q7, q0              ; qs0: q0 offset to convert to a signed value
    veor q6, q6, q0              ; ps0: p0 offset to convert to a signed value
    veor q5, q5, q0              ; ps1: p1 offset to convert to a signed value
    veor q8, q8, q0              ; qs1: q1 offset to convert to a signed value

    vcge.u8 q15, q1, q15         ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1

    vsubl.s8 q2, d14, d12        ; ( qs0 - ps0)
    vsubl.s8 q3, d15, d13

    vqsub.s8 q4, q5, q8          ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)

    vmul.s16 q2, q2, q13         ; 3 * ( qs0 - ps0)
    vmul.s16 q3, q3, q13

    vmov.u8 q10, #0x03           ; 0x03
    vmov.u8 q9, #0x04            ; 0x04

    vaddw.s8 q2, q2, d8          ; vp8_filter + 3 * ( qs0 - ps0)
    vaddw.s8 q3, q3, d9

    vqmovn.s16 d8, q2            ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16 d9, q3

    vand q14, q4, q15            ; vp8_filter &= mask

    vqadd.s8 q2, q14, q10        ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
    vqadd.s8 q3, q14, q9         ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
    vshr.s8 q2, q2, #3           ; Filter2 >>= 3
    vshr.s8 q4, q3, #3           ; Filter1 >>= 3

    sub r0, r0, r1

    ;calculate output
    vqadd.s8 q11, q6, q2         ; u = vp8_signed_char_clamp(ps0 + Filter2)
    vqsub.s8 q10, q7, q4         ; u = vp8_signed_char_clamp(qs0 - Filter1)

    veor q6, q11, q0             ; *op0 = u^0x80
    veor q7, q10, q0             ; *oq0 = u^0x80

    vst1.u8 {q6}, [r3@128]       ; store op0
    vst1.u8 {q7}, [r0@128]       ; store oq0

    bx lr
    ENDP    ; |vp8_loop_filter_simple_horizontal_edge_neon|

; r0    unsigned char *y
; r1    int ystride
; r2    const unsigned char *blimit

|vp8_loop_filter_bhs_neon| PROC
    push {r4, lr}
    ldrb r3, [r2]                ; load blim from mem
    vdup.s8 q1, r3               ; duplicate blim

    add r0, r0, r1, lsl #2       ; src = y_ptr + 4 * y_stride
    bl vp8_loop_filter_simple_horizontal_edge_neon
    ; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
    add r0, r0, r1, lsl #2       ; src = y_ptr + 8 * y_stride
    bl vp8_loop_filter_simple_horizontal_edge_neon
    add r0, r0, r1, lsl #2       ; src = y_ptr + 12 * y_stride
    pop {r4, lr}
    b vp8_loop_filter_simple_horizontal_edge_neon
    ENDP    ;|vp8_loop_filter_bhs_neon|

; r0    unsigned char *y
; r1    int ystride
; r2    const unsigned char *blimit

|vp8_loop_filter_mbhs_neon| PROC
    ldrb r3, [r2]                ; load mblim from mem
    vdup.s8 q1, r3               ; duplicate mblim
    b vp8_loop_filter_simple_horizontal_edge_neon
    ENDP    ;|vp8_loop_filter_mbhs_neon|

    END
@ -0,0 +1,111 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
        unsigned char *s,
        int p,
        const unsigned char *blimit) {
    uint8_t *sp;
    uint8x16_t qblimit, q0u8;
    uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
    int16x8_t q2s16, q3s16, q13s16;
    int8x8_t d8s8, d9s8;
    int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;

    qblimit = vdupq_n_u8(*blimit);

    sp = s - (p << 1);
    q5u8 = vld1q_u8(sp);
    sp += p;
    q6u8 = vld1q_u8(sp);
    sp += p;
    q7u8 = vld1q_u8(sp);
    sp += p;
    q8u8 = vld1q_u8(sp);

    q15u8 = vabdq_u8(q6u8, q7u8);
    q14u8 = vabdq_u8(q5u8, q8u8);

    q15u8 = vqaddq_u8(q15u8, q15u8);
    q14u8 = vshrq_n_u8(q14u8, 1);
    q0u8 = vdupq_n_u8(0x80);
    q13s16 = vdupq_n_s16(3);
    q15u8 = vqaddq_u8(q15u8, q14u8);

    q5u8 = veorq_u8(q5u8, q0u8);
    q6u8 = veorq_u8(q6u8, q0u8);
    q7u8 = veorq_u8(q7u8, q0u8);
    q8u8 = veorq_u8(q8u8, q0u8);

    q15u8 = vcgeq_u8(qblimit, q15u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
                     vget_low_s8(vreinterpretq_s8_u8(q6u8)));
    q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
                     vget_high_s8(vreinterpretq_s8_u8(q6u8)));

    q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
                     vreinterpretq_s8_u8(q8u8));

    q2s16 = vmulq_s16(q2s16, q13s16);
    q3s16 = vmulq_s16(q3s16, q13s16);

    q10u8 = vdupq_n_u8(3);
    q9u8 = vdupq_n_u8(4);

    q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
    q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));

    d8s8 = vqmovn_s16(q2s16);
    d9s8 = vqmovn_s16(q3s16);
    q4s8 = vcombine_s8(d8s8, d9s8);

    q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));

    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q3s8 = vshrq_n_s8(q3s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);

    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);

    vst1q_u8(s, q7u8);
    s -= p;
    vst1q_u8(s, q6u8);
    return;
}

void vp8_loop_filter_bhs_neon(
        unsigned char *y_ptr,
        int y_stride,
        const unsigned char *blimit) {
    y_ptr += y_stride * 4;
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    y_ptr += y_stride * 4;
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    y_ptr += y_stride * 4;
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    return;
}

void vp8_loop_filter_mbhs_neon(
        unsigned char *y_ptr,
        int y_stride,
        const unsigned char *blimit) {
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
    return;
}
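The mask computed at the top of the function above, in scalar terms: the edge is filtered only where 2*|p0-q0| + |p1-q1|/2 <= blimit. A minimal sketch (helper name illustrative; the single clamp here gives the same result as the per-step saturating adds in the vector code, since all terms are non-negative):

static int simple_filter_mask(unsigned char blimit, unsigned char p1,
                              unsigned char p0, unsigned char q0,
                              unsigned char q1) {
    int d0 = p0 > q0 ? p0 - q0 : q0 - p0;  /* vabdq_u8(q6u8, q7u8) */
    int d1 = p1 > q1 ? p1 - q1 : q1 - p1;  /* vabdq_u8(q5u8, q8u8) */
    int sum = d0 * 2 + d1 / 2;             /* vqaddq_u8 / vshrq_n_u8 */
    if (sum > 255) sum = 255;              /* u8 saturation */
    return sum <= blimit ? -1 : 0;         /* vcgeq_u8(qblimit, q15u8) */
}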
@ -1,154 +0,0 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    ;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
    EXPORT |vp8_loop_filter_bvs_neon|
    EXPORT |vp8_loop_filter_mbvs_neon|
    ARM
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *s, PRESERVE
; r1    int p, PRESERVE
; q1    limit, PRESERVE

|vp8_loop_filter_simple_vertical_edge_neon| PROC
    sub r0, r0, #2               ; move src pointer down by 2 columns
    add r12, r1, r1
    add r3, r0, r1

    vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
    vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
    vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
    vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
    vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
    vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
    vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
    vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12

    vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
    vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
    vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
    vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
    vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
    vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
    vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
    vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]

    vswp d7, d10
    vswp d12, d9

    ;vp8_filter_mask() function
    ;vp8_hevmask() function
    sub r0, r0, r1, lsl #4
    vabd.u8 q15, q5, q4          ; abs(p0 - q0)
    vabd.u8 q14, q3, q6          ; abs(p1 - q1)

    vqadd.u8 q15, q15, q15       ; abs(p0 - q0) * 2
    vshr.u8 q14, q14, #1         ; abs(p1 - q1) / 2
    vmov.u8 q0, #0x80            ; 0x80
    vmov.s16 q11, #3
    vqadd.u8 q15, q15, q14       ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    veor q4, q4, q0              ; qs0: q0 offset to convert to a signed value
    veor q5, q5, q0              ; ps0: p0 offset to convert to a signed value
    veor q3, q3, q0              ; ps1: p1 offset to convert to a signed value
    veor q6, q6, q0              ; qs1: q1 offset to convert to a signed value

    vcge.u8 q15, q1, q15         ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1

    vsubl.s8 q2, d8, d10         ; ( qs0 - ps0)
    vsubl.s8 q13, d9, d11

    vqsub.s8 q14, q3, q6         ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)

    vmul.s16 q2, q2, q11         ; 3 * ( qs0 - ps0)
    vmul.s16 q13, q13, q11

    vmov.u8 q11, #0x03           ; 0x03
    vmov.u8 q12, #0x04           ; 0x04

    vaddw.s8 q2, q2, d28         ; vp8_filter + 3 * ( qs0 - ps0)
    vaddw.s8 q13, q13, d29

    vqmovn.s16 d28, q2           ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16 d29, q13

    add r0, r0, #1
    add r3, r0, r1

    vand q14, q14, q15           ; vp8_filter &= mask

    vqadd.s8 q2, q14, q11        ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
    vqadd.s8 q3, q14, q12        ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
    vshr.s8 q2, q2, #3           ; Filter2 >>= 3
    vshr.s8 q14, q3, #3          ; Filter1 >>= 3

    ;calculate output
    vqadd.s8 q11, q5, q2         ; u = vp8_signed_char_clamp(ps0 + Filter2)
    vqsub.s8 q10, q4, q14        ; u = vp8_signed_char_clamp(qs0 - Filter1)

    veor q6, q11, q0             ; *op0 = u^0x80
    veor q7, q10, q0             ; *oq0 = u^0x80
    add r12, r1, r1
    vswp d13, d14

    ;store op1, op0, oq0, oq1
    vst2.8 {d12[0], d13[0]}, [r0], r12
    vst2.8 {d12[1], d13[1]}, [r3], r12
    vst2.8 {d12[2], d13[2]}, [r0], r12
    vst2.8 {d12[3], d13[3]}, [r3], r12
    vst2.8 {d12[4], d13[4]}, [r0], r12
    vst2.8 {d12[5], d13[5]}, [r3], r12
    vst2.8 {d12[6], d13[6]}, [r0], r12
    vst2.8 {d12[7], d13[7]}, [r3], r12
    vst2.8 {d14[0], d15[0]}, [r0], r12
    vst2.8 {d14[1], d15[1]}, [r3], r12
    vst2.8 {d14[2], d15[2]}, [r0], r12
    vst2.8 {d14[3], d15[3]}, [r3], r12
    vst2.8 {d14[4], d15[4]}, [r0], r12
    vst2.8 {d14[5], d15[5]}, [r3], r12
    vst2.8 {d14[6], d15[6]}, [r0], r12
    vst2.8 {d14[7], d15[7]}, [r3]

    bx lr
    ENDP    ; |vp8_loop_filter_simple_vertical_edge_neon|

; r0    unsigned char *y
; r1    int ystride
; r2    const unsigned char *blimit

|vp8_loop_filter_bvs_neon| PROC
    push {r4, lr}
    ldrb r3, [r2]                ; load blim from mem
    mov r4, r0
    add r0, r0, #4
    vdup.s8 q1, r3               ; duplicate blim
    bl vp8_loop_filter_simple_vertical_edge_neon
    ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
    add r0, r4, #8
    bl vp8_loop_filter_simple_vertical_edge_neon
    add r0, r4, #12
    pop {r4, lr}
    b vp8_loop_filter_simple_vertical_edge_neon
    ENDP    ;|vp8_loop_filter_bvs_neon|

; r0    unsigned char *y
; r1    int ystride
; r2    const unsigned char *blimit

|vp8_loop_filter_mbvs_neon| PROC
    ldrb r3, [r2]                ; load mblim from mem
    vdup.s8 q1, r3               ; duplicate mblim
    b vp8_loop_filter_simple_vertical_edge_neon
    ENDP    ;|vp8_loop_filter_mbvs_neon|
    END
@ -0,0 +1,279 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
static INLINE void write_2x8(unsigned char *dst, int pitch,
                             const uint8x8x2_t result,
                             const uint8x8x2_t result2) {
    vst2_lane_u8(dst, result, 0);
    dst += pitch;
    vst2_lane_u8(dst, result, 1);
    dst += pitch;
    vst2_lane_u8(dst, result, 2);
    dst += pitch;
    vst2_lane_u8(dst, result, 3);
    dst += pitch;
    vst2_lane_u8(dst, result, 4);
    dst += pitch;
    vst2_lane_u8(dst, result, 5);
    dst += pitch;
    vst2_lane_u8(dst, result, 6);
    dst += pitch;
    vst2_lane_u8(dst, result, 7);
    dst += pitch;

    vst2_lane_u8(dst, result2, 0);
    dst += pitch;
    vst2_lane_u8(dst, result2, 1);
    dst += pitch;
    vst2_lane_u8(dst, result2, 2);
    dst += pitch;
    vst2_lane_u8(dst, result2, 3);
    dst += pitch;
    vst2_lane_u8(dst, result2, 4);
    dst += pitch;
    vst2_lane_u8(dst, result2, 5);
    dst += pitch;
    vst2_lane_u8(dst, result2, 6);
    dst += pitch;
    vst2_lane_u8(dst, result2, 7);
}
#else
static INLINE void write_2x4(unsigned char *dst, int pitch,
                             const uint8x8x2_t result) {
    /*
     * uint8x8x2_t result
    00 01 02 03 | 04 05 06 07
    10 11 12 13 | 14 15 16 17
    ---
    * after vtrn_u8
    00 10 02 12 | 04 14 06 16
    01 11 03 13 | 05 15 07 17
    */
    const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
                                       result.val[1]);
    const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
    const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
    vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
    dst += pitch;
    vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
}

static INLINE void write_2x8(unsigned char *dst, int pitch,
                             const uint8x8x2_t result,
                             const uint8x8x2_t result2) {
    write_2x4(dst, pitch, result);
    dst += pitch * 8;
    write_2x4(dst, pitch, result2);
}
#endif


#if (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
    x = vld4_lane_u8(src, x, 0);
    src += pitch;
    x = vld4_lane_u8(src, x, 1);
    src += pitch;
    x = vld4_lane_u8(src, x, 2);
    src += pitch;
    x = vld4_lane_u8(src, x, 3);
    src += pitch;
    x = vld4_lane_u8(src, x, 4);
    src += pitch;
    x = vld4_lane_u8(src, x, 5);
    src += pitch;
    x = vld4_lane_u8(src, x, 6);
    src += pitch;
    x = vld4_lane_u8(src, x, 7);
    return x;
}
#else
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
    const uint8x8_t a = vld1_u8(src);
    const uint8x8_t b = vld1_u8(src + pitch * 1);
    const uint8x8_t c = vld1_u8(src + pitch * 2);
    const uint8x8_t d = vld1_u8(src + pitch * 3);
    const uint8x8_t e = vld1_u8(src + pitch * 4);
    const uint8x8_t f = vld1_u8(src + pitch * 5);
    const uint8x8_t g = vld1_u8(src + pitch * 6);
    const uint8x8_t h = vld1_u8(src + pitch * 7);
    const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
                                          vreinterpret_u32_u8(e));
    const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
                                          vreinterpret_u32_u8(f));
    const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
                                          vreinterpret_u32_u8(g));
    const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
                                          vreinterpret_u32_u8(h));
    const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
                                          vreinterpret_u16_u32(r26_u32.val[0]));
    const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
                                          vreinterpret_u16_u32(r37_u32.val[0]));
    const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                       vreinterpret_u8_u16(r13_u16.val[0]));
    const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                       vreinterpret_u8_u16(r13_u16.val[1]));
    /*
     * after vtrn_u32
    00 01 02 03 | 40 41 42 43
    10 11 12 13 | 50 51 52 53
    20 21 22 23 | 60 61 62 63
    30 31 32 33 | 70 71 72 73
    ---
    * after vtrn_u16
    00 01 20 21 | 40 41 60 61
    02 03 22 23 | 42 43 62 63
    10 11 30 31 | 50 51 70 71
    12 13 32 33 | 52 53 72 73

    00 01 20 21 | 40 41 60 61
    10 11 30 31 | 50 51 70 71
    02 03 22 23 | 42 43 62 63
    12 13 32 33 | 52 53 72 73
    ---
    * after vtrn_u8
    00 10 20 30 | 40 50 60 70
    01 11 21 31 | 41 51 61 71
    02 12 22 32 | 42 52 62 72
    03 13 23 33 | 43 53 63 73
    */
    x.val[0] = r01_u8.val[0];
    x.val[1] = r01_u8.val[1];
    x.val[2] = r23_u8.val[0];
    x.val[3] = r23_u8.val[1];

    return x;
}
#endif

static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
        unsigned char *s,
        int p,
        const unsigned char *blimit) {
    unsigned char *src1;
    uint8x16_t qblimit, q0u8;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
    int16x8_t q2s16, q13s16, q11s16;
    int8x8_t d28s8, d29s8;
    int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
    uint8x8x4_t d0u8x4;  // d6, d7, d8, d9
    uint8x8x4_t d1u8x4;  // d10, d11, d12, d13
    uint8x8x2_t d2u8x2;  // d12, d13
    uint8x8x2_t d3u8x2;  // d14, d15

    qblimit = vdupq_n_u8(*blimit);

    src1 = s - 2;
    d0u8x4 = read_4x8(src1, p, d0u8x4);
    src1 += p * 8;
    d1u8x4 = read_4x8(src1, p, d1u8x4);

    q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]);  // d6 d10
    q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]);  // d8 d12
    q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]);  // d7 d11
    q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]);  // d9 d13

    q15u8 = vabdq_u8(q5u8, q4u8);
    q14u8 = vabdq_u8(q3u8, q6u8);

    q15u8 = vqaddq_u8(q15u8, q15u8);
    q14u8 = vshrq_n_u8(q14u8, 1);
    q0u8 = vdupq_n_u8(0x80);
    q11s16 = vdupq_n_s16(3);
    q15u8 = vqaddq_u8(q15u8, q14u8);

    q3u8 = veorq_u8(q3u8, q0u8);
    q4u8 = veorq_u8(q4u8, q0u8);
    q5u8 = veorq_u8(q5u8, q0u8);
    q6u8 = veorq_u8(q6u8, q0u8);

    q15u8 = vcgeq_u8(qblimit, q15u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
                     vget_low_s8(vreinterpretq_s8_u8(q5u8)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
                      vget_high_s8(vreinterpretq_s8_u8(q5u8)));

    q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
                      vreinterpretq_s8_u8(q6u8));

    q2s16 = vmulq_s16(q2s16, q11s16);
    q13s16 = vmulq_s16(q13s16, q11s16);

    q11u8 = vdupq_n_u8(3);
    q12u8 = vdupq_n_u8(4);

    q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));

    d28s8 = vqmovn_s16(q2s16);
    d29s8 = vqmovn_s16(q13s16);
    q14s8 = vcombine_s8(d28s8, d29s8);

    q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));

    q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
    q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q14s8 = vshrq_n_s8(q3s8, 3);

    q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
    q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);

    q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
    q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);

    d2u8x2.val[0] = vget_low_u8(q6u8);   // d12
    d2u8x2.val[1] = vget_low_u8(q7u8);   // d14
    d3u8x2.val[0] = vget_high_u8(q6u8);  // d13
    d3u8x2.val[1] = vget_high_u8(q7u8);  // d15

    src1 = s - 1;
    write_2x8(src1, p, d2u8x2, d3u8x2);
}

void vp8_loop_filter_bvs_neon(
        unsigned char *y_ptr,
        int y_stride,
        const unsigned char *blimit) {
    y_ptr += 4;
    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
    y_ptr += 4;
    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
    y_ptr += 4;
    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
    return;
}

void vp8_loop_filter_mbvs_neon(
        unsigned char *y_ptr,
        int y_stride,
        const unsigned char *blimit) {
    vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
    return;
}
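The transpose diagrams above describe what read_4x8 produces; in scalar terms, lane r of x.val[k] ends up holding the byte at column k of row r. A plain-C equivalent of either variant, for reference (name illustrative):

static void read_4x8_scalar(const unsigned char *src, int pitch,
                            unsigned char out[4][8]) {
    int r, k;
    for (r = 0; r < 8; r++)
        for (k = 0; k < 4; k++)
            out[k][r] = src[r * pitch + k];  /* column k of row r */
}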
@ -1,469 +0,0 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
    EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
    EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
    EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
;                                               const unsigned char *blimit,
;                                               const unsigned char *limit,
;                                               const unsigned char *thresh)
; r0    unsigned char *src,
; r1    int pitch,
; r2    unsigned char blimit
; r3    unsigned char limit
; sp    unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
    push {lr}
    add r1, r1, r1               ; double stride
    ldr r12, [sp, #4]            ; load thresh
    sub r0, r0, r1, lsl #1       ; move src pointer down by 4 lines
    vdup.u8 q2, r12              ; thresh
    add r12, r0, r1, lsr #1      ; move src pointer up by 1 line

    vld1.u8 {q3}, [r0@128], r1   ; p3
    vld1.u8 {q4}, [r12@128], r1  ; p2
    vld1.u8 {q5}, [r0@128], r1   ; p1
    vld1.u8 {q6}, [r12@128], r1  ; p0
    vld1.u8 {q7}, [r0@128], r1   ; q0
    vld1.u8 {q8}, [r12@128], r1  ; q1
    vld1.u8 {q9}, [r0@128], r1   ; q2
    vld1.u8 {q10}, [r12@128], r1 ; q3

    bl vp8_mbloop_filter_neon

    sub r12, r12, r1, lsl #2
    add r0, r12, r1, lsr #1

    vst1.u8 {q4}, [r12@128], r1  ; store op2
    vst1.u8 {q5}, [r0@128], r1   ; store op1
    vst1.u8 {q6}, [r12@128], r1  ; store op0
    vst1.u8 {q7}, [r0@128], r1   ; store oq0
    vst1.u8 {q8}, [r12@128]      ; store oq1
    vst1.u8 {q9}, [r0@128]       ; store oq2

    pop {pc}
    ENDP    ; |vp8_mbloop_filter_horizontal_edge_y_neon|

; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
;                                                const unsigned char *blimit,
;                                                const unsigned char *limit,
;                                                const unsigned char *thresh,
;                                                unsigned char *v)
; r0    unsigned char *u,
; r1    int pitch,
; r2    unsigned char blimit
; r3    unsigned char limit
; sp    unsigned char thresh,
; sp+4  unsigned char *v

|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
    push {lr}
    ldr r12, [sp, #4]            ; load thresh
    sub r0, r0, r1, lsl #2       ; move u pointer down by 4 lines
    vdup.u8 q2, r12              ; thresh
    ldr r12, [sp, #8]            ; load v ptr
    sub r12, r12, r1, lsl #2     ; move v pointer down by 4 lines

    vld1.u8 {d6}, [r0@64], r1    ; p3
    vld1.u8 {d7}, [r12@64], r1   ; p3
    vld1.u8 {d8}, [r0@64], r1    ; p2
    vld1.u8 {d9}, [r12@64], r1   ; p2
    vld1.u8 {d10}, [r0@64], r1   ; p1
    vld1.u8 {d11}, [r12@64], r1  ; p1
    vld1.u8 {d12}, [r0@64], r1   ; p0
    vld1.u8 {d13}, [r12@64], r1  ; p0
    vld1.u8 {d14}, [r0@64], r1   ; q0
    vld1.u8 {d15}, [r12@64], r1  ; q0
    vld1.u8 {d16}, [r0@64], r1   ; q1
    vld1.u8 {d17}, [r12@64], r1  ; q1
    vld1.u8 {d18}, [r0@64], r1   ; q2
    vld1.u8 {d19}, [r12@64], r1  ; q2
    vld1.u8 {d20}, [r0@64], r1   ; q3
    vld1.u8 {d21}, [r12@64], r1  ; q3

    bl vp8_mbloop_filter_neon

    sub r0, r0, r1, lsl #3
    sub r12, r12, r1, lsl #3

    add r0, r0, r1
    add r12, r12, r1

    vst1.u8 {d8}, [r0@64], r1    ; store u op2
    vst1.u8 {d9}, [r12@64], r1   ; store v op2
    vst1.u8 {d10}, [r0@64], r1   ; store u op1
    vst1.u8 {d11}, [r12@64], r1  ; store v op1
    vst1.u8 {d12}, [r0@64], r1   ; store u op0
    vst1.u8 {d13}, [r12@64], r1  ; store v op0
    vst1.u8 {d14}, [r0@64], r1   ; store u oq0
    vst1.u8 {d15}, [r12@64], r1  ; store v oq0
    vst1.u8 {d16}, [r0@64], r1   ; store u oq1
    vst1.u8 {d17}, [r12@64], r1  ; store v oq1
    vst1.u8 {d18}, [r0@64], r1   ; store u oq2
    vst1.u8 {d19}, [r12@64], r1  ; store v oq2

    pop {pc}
    ENDP    ; |vp8_mbloop_filter_horizontal_edge_uv_neon|

; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
;                                             const unsigned char *blimit,
;                                             const unsigned char *limit,
;                                             const unsigned char *thresh)
; r0    unsigned char *src,
; r1    int pitch,
; r2    unsigned char blimit
; r3    unsigned char limit
; sp    unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
    push {lr}
    ldr r12, [sp, #4]            ; load thresh
    sub r0, r0, #4               ; move src pointer down by 4 columns
    vdup.s8 q2, r12              ; thresh
    add r12, r0, r1, lsl #3      ; move src pointer down by 8 lines

    vld1.u8 {d6}, [r0], r1       ; load first 8-line src data
    vld1.u8 {d7}, [r12], r1      ; load second 8-line src data
    vld1.u8 {d8}, [r0], r1
    vld1.u8 {d9}, [r12], r1
    vld1.u8 {d10}, [r0], r1
    vld1.u8 {d11}, [r12], r1
    vld1.u8 {d12}, [r0], r1
    vld1.u8 {d13}, [r12], r1
    vld1.u8 {d14}, [r0], r1
    vld1.u8 {d15}, [r12], r1
    vld1.u8 {d16}, [r0], r1
    vld1.u8 {d17}, [r12], r1
    vld1.u8 {d18}, [r0], r1
    vld1.u8 {d19}, [r12], r1
    vld1.u8 {d20}, [r0], r1
    vld1.u8 {d21}, [r12], r1

    ;transpose to 8x16 matrix
    vtrn.32 q3, q7
    vtrn.32 q4, q8
    vtrn.32 q5, q9
    vtrn.32 q6, q10

    vtrn.16 q3, q5
    vtrn.16 q4, q6
    vtrn.16 q7, q9
    vtrn.16 q8, q10

    vtrn.8 q3, q4
    vtrn.8 q5, q6
    vtrn.8 q7, q8
    vtrn.8 q9, q10

    sub r0, r0, r1, lsl #3

    bl vp8_mbloop_filter_neon

    sub r12, r12, r1, lsl #3

    ;transpose to 16x8 matrix
    vtrn.32 q3, q7
    vtrn.32 q4, q8
    vtrn.32 q5, q9
    vtrn.32 q6, q10

    vtrn.16 q3, q5
    vtrn.16 q4, q6
    vtrn.16 q7, q9
    vtrn.16 q8, q10

    vtrn.8 q3, q4
    vtrn.8 q5, q6
    vtrn.8 q7, q8
    vtrn.8 q9, q10

    ;store op2, op1, op0, oq0, oq1, oq2
    vst1.8 {d6}, [r0], r1
    vst1.8 {d7}, [r12], r1
    vst1.8 {d8}, [r0], r1
    vst1.8 {d9}, [r12], r1
    vst1.8 {d10}, [r0], r1
    vst1.8 {d11}, [r12], r1
    vst1.8 {d12}, [r0], r1
    vst1.8 {d13}, [r12], r1
    vst1.8 {d14}, [r0], r1
    vst1.8 {d15}, [r12], r1
    vst1.8 {d16}, [r0], r1
    vst1.8 {d17}, [r12], r1
    vst1.8 {d18}, [r0], r1
    vst1.8 {d19}, [r12], r1
    vst1.8 {d20}, [r0]
    vst1.8 {d21}, [r12]

    pop {pc}
    ENDP    ; |vp8_mbloop_filter_vertical_edge_y_neon|

; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
;                                              const unsigned char *blimit,
;                                              const unsigned char *limit,
;                                              const unsigned char *thresh,
;                                              unsigned char *v)
; r0    unsigned char *u,
; r1    int pitch,
; r2    unsigned char blimit
; r3    unsigned char limit
; sp    unsigned char thresh,
; sp+4  unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
    push {lr}
    ldr r12, [sp, #4]            ; load thresh
    sub r0, r0, #4               ; move u pointer down by 4 columns
    vdup.u8 q2, r12              ; thresh
    ldr r12, [sp, #8]            ; load v ptr
    sub r12, r12, #4             ; move v pointer down by 4 columns

    vld1.u8 {d6}, [r0], r1       ;load u data
    vld1.u8 {d7}, [r12], r1      ;load v data
    vld1.u8 {d8}, [r0], r1
    vld1.u8 {d9}, [r12], r1
    vld1.u8 {d10}, [r0], r1
    vld1.u8 {d11}, [r12], r1
    vld1.u8 {d12}, [r0], r1
    vld1.u8 {d13}, [r12], r1
    vld1.u8 {d14}, [r0], r1
    vld1.u8 {d15}, [r12], r1
    vld1.u8 {d16}, [r0], r1
    vld1.u8 {d17}, [r12], r1
    vld1.u8 {d18}, [r0], r1
    vld1.u8 {d19}, [r12], r1
    vld1.u8 {d20}, [r0], r1
    vld1.u8 {d21}, [r12], r1

    ;transpose to 8x16 matrix
    vtrn.32 q3, q7
    vtrn.32 q4, q8
    vtrn.32 q5, q9
    vtrn.32 q6, q10

    vtrn.16 q3, q5
    vtrn.16 q4, q6
    vtrn.16 q7, q9
    vtrn.16 q8, q10

    vtrn.8 q3, q4
    vtrn.8 q5, q6
    vtrn.8 q7, q8
    vtrn.8 q9, q10

    sub r0, r0, r1, lsl #3

    bl vp8_mbloop_filter_neon

    sub r12, r12, r1, lsl #3

    ;transpose to 16x8 matrix
    vtrn.32 q3, q7
    vtrn.32 q4, q8
    vtrn.32 q5, q9
    vtrn.32 q6, q10

    vtrn.16 q3, q5
    vtrn.16 q4, q6
    vtrn.16 q7, q9
    vtrn.16 q8, q10

    vtrn.8 q3, q4
    vtrn.8 q5, q6
    vtrn.8 q7, q8
    vtrn.8 q9, q10

    ;store op2, op1, op0, oq0, oq1, oq2
    vst1.8 {d6}, [r0], r1
    vst1.8 {d7}, [r12], r1
    vst1.8 {d8}, [r0], r1
    vst1.8 {d9}, [r12], r1
    vst1.8 {d10}, [r0], r1
    vst1.8 {d11}, [r12], r1
    vst1.8 {d12}, [r0], r1
    vst1.8 {d13}, [r12], r1
    vst1.8 {d14}, [r0], r1
    vst1.8 {d15}, [r12], r1
    vst1.8 {d16}, [r0], r1
    vst1.8 {d17}, [r12], r1
    vst1.8 {d18}, [r0], r1
    vst1.8 {d19}, [r12], r1
    vst1.8 {d20}, [r0]
    vst1.8 {d21}, [r12]

    pop {pc}
    ENDP    ; |vp8_mbloop_filter_vertical_edge_uv_neon|

; void vp8_mbloop_filter_neon()
; This is a helper function for the macroblock loopfilters. The individual
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.

; r0,r1 PRESERVE
; r2    mblimit
; r3    limit

; q2    thresh
; q3    p3 PRESERVE
; q4    p2
; q5    p1
; q6    p0
; q7    q0
; q8    q1
; q9    q2
; q10   q3 PRESERVE

|vp8_mbloop_filter_neon| PROC

    ; vp8_filter_mask
    vabd.u8 q11, q3, q4          ; abs(p3 - p2)
    vabd.u8 q12, q4, q5          ; abs(p2 - p1)
    vabd.u8 q13, q5, q6          ; abs(p1 - p0)
    vabd.u8 q14, q8, q7          ; abs(q1 - q0)
    vabd.u8 q1, q9, q8           ; abs(q2 - q1)
    vabd.u8 q0, q10, q9          ; abs(q3 - q2)

    vmax.u8 q11, q11, q12
    vmax.u8 q12, q13, q14
    vmax.u8 q1, q1, q0
    vmax.u8 q15, q11, q12

    vabd.u8 q12, q6, q7          ; abs(p0 - q0)

    ; vp8_hevmask
    vcgt.u8 q13, q13, q2         ; (abs(p1 - p0) > thresh) * -1
    vcgt.u8 q14, q14, q2         ; (abs(q1 - q0) > thresh) * -1
    vmax.u8 q15, q15, q1

    vdup.u8 q1, r3               ; limit
    vdup.u8 q2, r2               ; mblimit

    vmov.u8 q0, #0x80            ; 0x80

    vcge.u8 q15, q1, q15

    vabd.u8 q1, q5, q8           ; a = abs(p1 - q1)
    vqadd.u8 q12, q12, q12       ; b = abs(p0 - q0) * 2
    vmov.u16 q11, #3             ; #3

    ; vp8_filter
    ; convert to signed
    veor q7, q7, q0              ; qs0
    vshr.u8 q1, q1, #1           ; a = a / 2
    veor q6, q6, q0              ; ps0
    veor q5, q5, q0              ; ps1

    vqadd.u8 q12, q12, q1        ; a = b + a

    veor q8, q8, q0              ; qs1
    veor q4, q4, q0              ; ps2
    veor q9, q9, q0              ; qs2

    vorr q14, q13, q14           ; vp8_hevmask

    vcge.u8 q12, q2, q12         ; (a > flimit * 2 + limit) * -1

    vsubl.s8 q2, d14, d12        ; qs0 - ps0
    vsubl.s8 q13, d15, d13

    vqsub.s8 q1, q5, q8          ; vp8_filter = clamp(ps1-qs1)

    vmul.i16 q2, q2, q11         ; 3 * ( qs0 - ps0)

    vand q15, q15, q12           ; vp8_filter_mask

    vmul.i16 q13, q13, q11

    vmov.u8 q12, #3              ; #3

    vaddw.s8 q2, q2, d2          ; vp8_filter + 3 * ( qs0 - ps0)
    vaddw.s8 q13, q13, d3

    vmov.u8 q11, #4              ; #4

    ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    vqmovn.s16 d2, q2
    vqmovn.s16 d3, q13

    vand q1, q1, q15             ; vp8_filter &= mask

    vmov.u16 q15, #63            ; #63

    vand q13, q1, q14            ; Filter2 &= hev

    vqadd.s8 q2, q13, q11        ; Filter1 = clamp(Filter2+4)
    vqadd.s8 q13, q13, q12       ; Filter2 = clamp(Filter2+3)

    vmov q0, q15

    vshr.s8 q2, q2, #3           ; Filter1 >>= 3
    vshr.s8 q13, q13, #3         ; Filter2 >>= 3

    vmov q11, q15
    vmov q12, q15

    vqsub.s8 q7, q7, q2          ; qs0 = clamp(qs0 - Filter1)

    vqadd.s8 q6, q6, q13         ; ps0 = clamp(ps0 + Filter2)

    vbic q1, q1, q14             ; vp8_filter &= ~hev

    ; roughly 1/7th difference across boundary
    ; roughly 2/7th difference across boundary
    ; roughly 3/7th difference across boundary

    vmov.u8 d5, #9               ; #9
    vmov.u8 d4, #18              ; #18

    vmov q13, q15
    vmov q14, q15

    vmlal.s8 q0, d2, d5          ; 63 + Filter2 * 9
    vmlal.s8 q11, d3, d5
    vmov.u8 d5, #27              ; #27
    vmlal.s8 q12, d2, d4         ; 63 + Filter2 * 18
    vmlal.s8 q13, d3, d4
    vmlal.s8 q14, d2, d5         ; 63 + Filter2 * 27
    vmlal.s8 q15, d3, d5

    vqshrn.s16 d0, q0, #7        ; u = clamp((63 + Filter2 * 9)>>7)
    vqshrn.s16 d1, q11, #7
    vqshrn.s16 d24, q12, #7      ; u = clamp((63 + Filter2 * 18)>>7)
    vqshrn.s16 d25, q13, #7
    vqshrn.s16 d28, q14, #7      ; u = clamp((63 + Filter2 * 27)>>7)
    vqshrn.s16 d29, q15, #7

    vmov.u8 q1, #0x80            ; 0x80

    vqsub.s8 q11, q9, q0         ; s = clamp(qs2 - u)
    vqadd.s8 q0, q4, q0          ; s = clamp(ps2 + u)
    vqsub.s8 q13, q8, q12        ; s = clamp(qs1 - u)
    vqadd.s8 q12, q5, q12        ; s = clamp(ps1 + u)
    vqsub.s8 q15, q7, q14        ; s = clamp(qs0 - u)
    vqadd.s8 q14, q6, q14        ; s = clamp(ps0 + u)

    veor q9, q11, q1             ; *oq2 = s^0x80
    veor q4, q0, q1              ; *op2 = s^0x80
    veor q8, q13, q1             ; *oq1 = s^0x80
    veor q5, q12, q1             ; *op1 = s^0x80
    veor q7, q15, q1             ; *oq0 = s^0x80
    veor q6, q14, q1             ; *op0 = s^0x80

    bx lr
    ENDP    ; |vp8_mbloop_filter_neon|

;-----------------

    END
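The "1/7th, 2/7th, 3/7th" comments above describe three graded adjustments: with w the masked filter value (hev lanes removed by vbic), the weights 27, 18 and 9 are applied to the (p0,q0), (p1,q1) and (p2,q2) pairs via u = clamp((63 + w*k) >> 7). A scalar sketch of that step (names illustrative; an arithmetic right shift is assumed, matching vqshrn.s16):

static signed char clamp_s8(int v) {
    return (signed char)(v > 127 ? 127 : v < -128 ? -128 : v);  /* saturate to s8 */
}

static void mb_outer_taps(signed char w,
                          signed char *ps2, signed char *ps1, signed char *ps0,
                          signed char *qs0, signed char *qs1, signed char *qs2) {
    static const int k[3] = { 27, 18, 9 };            /* weights for (p0,q0), (p1,q1), (p2,q2) */
    signed char *p[3] = { ps0, ps1, ps2 };
    signed char *q[3] = { qs0, qs1, qs2 };
    int i;
    for (i = 0; i < 3; i++) {
        signed char u = clamp_s8((63 + w * k[i]) >> 7);  /* vmlal.s8 / vqshrn.s16 #7 */
        *p[i] = clamp_s8(*p[i] + u);                     /* s = clamp(ps + u) */
        *q[i] = clamp_s8(*q[i] - u);                     /* s = clamp(qs - u) */
    }
}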
625 media/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c Normal file
@ -0,0 +1,625 @@
||||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "./vpx_config.h"
|
||||
|
||||
static INLINE void vp8_mbloop_filter_neon(
|
||||
uint8x16_t qblimit, // mblimit
|
||||
uint8x16_t qlimit, // limit
|
||||
uint8x16_t qthresh, // thresh
|
||||
uint8x16_t q3, // p2
|
||||
uint8x16_t q4, // p2
|
||||
uint8x16_t q5, // p1
|
||||
uint8x16_t q6, // p0
|
||||
uint8x16_t q7, // q0
|
||||
uint8x16_t q8, // q1
|
||||
uint8x16_t q9, // q2
|
||||
uint8x16_t q10, // q3
|
||||
uint8x16_t *q4r, // p1
|
||||
uint8x16_t *q5r, // p1
|
||||
uint8x16_t *q6r, // p0
|
||||
uint8x16_t *q7r, // q0
|
||||
uint8x16_t *q8r, // q1
|
||||
uint8x16_t *q9r) { // q1
|
||||
    uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
    int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
    uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
    int8x16_t q0s8, q12s8, q14s8, q15s8;
    int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;

    q11u8 = vabdq_u8(q3, q4);
    q12u8 = vabdq_u8(q4, q5);
    q13u8 = vabdq_u8(q5, q6);
    q14u8 = vabdq_u8(q8, q7);
    q1u8 = vabdq_u8(q9, q8);
    q0u8 = vabdq_u8(q10, q9);

    q11u8 = vmaxq_u8(q11u8, q12u8);
    q12u8 = vmaxq_u8(q13u8, q14u8);
    q1u8 = vmaxq_u8(q1u8, q0u8);
    q15u8 = vmaxq_u8(q11u8, q12u8);

    q12u8 = vabdq_u8(q6, q7);

    // vp8_hevmask
    q13u8 = vcgtq_u8(q13u8, qthresh);
    q14u8 = vcgtq_u8(q14u8, qthresh);
    q15u8 = vmaxq_u8(q15u8, q1u8);

    q15u8 = vcgeq_u8(qlimit, q15u8);

    q1u8 = vabdq_u8(q5, q8);
    q12u8 = vqaddq_u8(q12u8, q12u8);

    // vp8_filter() function
    // convert to signed
    q0u8 = vdupq_n_u8(0x80);
    q9 = veorq_u8(q9, q0u8);
    q8 = veorq_u8(q8, q0u8);
    q7 = veorq_u8(q7, q0u8);
    q6 = veorq_u8(q6, q0u8);
    q5 = veorq_u8(q5, q0u8);
    q4 = veorq_u8(q4, q0u8);

    q1u8 = vshrq_n_u8(q1u8, 1);
    q12u8 = vqaddq_u8(q12u8, q1u8);

    q14u8 = vorrq_u8(q13u8, q14u8);
    q12u8 = vcgeq_u8(qblimit, q12u8);

    q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                     vget_low_s8(vreinterpretq_s8_u8(q6)));
    q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                      vget_high_s8(vreinterpretq_s8_u8(q6)));

    q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
                     vreinterpretq_s8_u8(q8));

    q11s16 = vdupq_n_s16(3);
    q2s16 = vmulq_s16(q2s16, q11s16);
    q13s16 = vmulq_s16(q13s16, q11s16);

    q15u8 = vandq_u8(q15u8, q12u8);

    q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
    q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));

    q12u8 = vdupq_n_u8(3);
    q11u8 = vdupq_n_u8(4);
    // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
    d2 = vqmovn_s16(q2s16);
    d3 = vqmovn_s16(q13s16);
    q1s8 = vcombine_s8(d2, d3);
    q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
    q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
    q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
    q2s8 = vshrq_n_s8(q2s8, 3);
    q13s8 = vshrq_n_s8(q13s8, 3);

    q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
    q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);

    q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

    q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
    d5 = vdup_n_s8(9);
    d4 = vdup_n_s8(18);

    q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
    q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
    d5 = vdup_n_s8(27);
    q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
    q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
    q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
    q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);

    d0 = vqshrn_n_s16(q0s16, 7);
    d1 = vqshrn_n_s16(q11s16, 7);
    d24 = vqshrn_n_s16(q12s16, 7);
    d25 = vqshrn_n_s16(q13s16, 7);
    d28 = vqshrn_n_s16(q14s16, 7);
    d29 = vqshrn_n_s16(q15s16, 7);

    q0s8 = vcombine_s8(d0, d1);
    q12s8 = vcombine_s8(d24, d25);
    q14s8 = vcombine_s8(d28, d29);

    q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
    q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
    q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
    q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
    q15s8 = vqsubq_s8((q7s8), q14s8);
    q14s8 = vqaddq_s8((q6s8), q14s8);

    q1u8 = vdupq_n_u8(0x80);
    *q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
    *q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
    *q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
    *q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
    *q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
    *q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
    return;
}

void vp8_mbloop_filter_horizontal_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    src -= (pitch << 2);

    q3 = vld1q_u8(src);
    src += pitch;
    q4 = vld1q_u8(src);
    src += pitch;
    q5 = vld1q_u8(src);
    src += pitch;
    q6 = vld1q_u8(src);
    src += pitch;
    q7 = vld1q_u8(src);
    src += pitch;
    q8 = vld1q_u8(src);
    src += pitch;
    q9 = vld1q_u8(src);
    src += pitch;
    q10 = vld1q_u8(src);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    src -= (pitch * 6);
    vst1q_u8(src, q4);
    src += pitch;
    vst1q_u8(src, q5);
    src += pitch;
    vst1q_u8(src, q6);
    src += pitch;
    vst1q_u8(src, q7);
    src += pitch;
    vst1q_u8(src, q8);
    src += pitch;
    vst1q_u8(src, q9);
    return;
}

void vp8_mbloop_filter_horizontal_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    u -= (pitch << 2);
    v -= (pitch << 2);

    d6 = vld1_u8(u);
    u += pitch;
    d7 = vld1_u8(v);
    v += pitch;
    d8 = vld1_u8(u);
    u += pitch;
    d9 = vld1_u8(v);
    v += pitch;
    d10 = vld1_u8(u);
    u += pitch;
    d11 = vld1_u8(v);
    v += pitch;
    d12 = vld1_u8(u);
    u += pitch;
    d13 = vld1_u8(v);
    v += pitch;
    d14 = vld1_u8(u);
    u += pitch;
    d15 = vld1_u8(v);
    v += pitch;
    d16 = vld1_u8(u);
    u += pitch;
    d17 = vld1_u8(v);
    v += pitch;
    d18 = vld1_u8(u);
    u += pitch;
    d19 = vld1_u8(v);
    v += pitch;
    d20 = vld1_u8(u);
    d21 = vld1_u8(v);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    u -= (pitch * 6);
    v -= (pitch * 6);
    vst1_u8(u, vget_low_u8(q4));
    u += pitch;
    vst1_u8(v, vget_high_u8(q4));
    v += pitch;
    vst1_u8(u, vget_low_u8(q5));
    u += pitch;
    vst1_u8(v, vget_high_u8(q5));
    v += pitch;
    vst1_u8(u, vget_low_u8(q6));
    u += pitch;
    vst1_u8(v, vget_high_u8(q6));
    v += pitch;
    vst1_u8(u, vget_low_u8(q7));
    u += pitch;
    vst1_u8(v, vget_high_u8(q7));
    v += pitch;
    vst1_u8(u, vget_low_u8(q8));
    u += pitch;
    vst1_u8(v, vget_high_u8(q8));
    v += pitch;
    vst1_u8(u, vget_low_u8(q9));
    vst1_u8(v, vget_high_u8(q9));
    return;
}

void vp8_mbloop_filter_vertical_edge_y_neon(
        unsigned char *src,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh) {
    unsigned char *s1, *s2;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    s1 = src - 4;
    s2 = s1 + 8 * pitch;
    d6 = vld1_u8(s1);
    s1 += pitch;
    d7 = vld1_u8(s2);
    s2 += pitch;
    d8 = vld1_u8(s1);
    s1 += pitch;
    d9 = vld1_u8(s2);
    s2 += pitch;
    d10 = vld1_u8(s1);
    s1 += pitch;
    d11 = vld1_u8(s2);
    s2 += pitch;
    d12 = vld1_u8(s1);
    s1 += pitch;
    d13 = vld1_u8(s2);
    s2 += pitch;
    d14 = vld1_u8(s1);
    s1 += pitch;
    d15 = vld1_u8(s2);
    s2 += pitch;
    d16 = vld1_u8(s1);
    s1 += pitch;
    d17 = vld1_u8(s2);
    s2 += pitch;
    d18 = vld1_u8(s1);
    s1 += pitch;
    d19 = vld1_u8(s2);
    s2 += pitch;
    d20 = vld1_u8(s1);
    d21 = vld1_u8(s2);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    s1 -= 7 * pitch;
    s2 -= 7 * pitch;

    vst1_u8(s1, vget_low_u8(q3));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q3));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q4));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q4));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q5));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q5));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q6));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q6));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q7));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q7));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q8));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q8));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q9));
    s1 += pitch;
    vst1_u8(s2, vget_high_u8(q9));
    s2 += pitch;
    vst1_u8(s1, vget_low_u8(q10));
    vst1_u8(s2, vget_high_u8(q10));
    return;
}

void vp8_mbloop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    us = u - 4;
    vs = v - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d7 = vld1_u8(vs);
    vs += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d20 = vld1_u8(us);
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                           q5, q6, q7, q8, q9, q10,
                           &q4, &q5, &q6, &q7, &q8, &q9);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                      vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                      vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    ud = u - 4;
    vst1_u8(ud, vget_low_u8(q3));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q4));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q5));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q6));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q7));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q8));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q9));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q10));

    vd = v - 4;
    vst1_u8(vd, vget_high_u8(q3));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q4));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q5));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q6));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q7));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q8));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q9));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q10));
    return;
}
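The function above derives its combined filter mask purely with vector compares (vabdq_u8/vmaxq_u8/vcgeq_u8 into q15u8 AND q12u8). For readers, a minimal scalar sketch of the same per-column mask follows; the helper name is ours and this is an illustration, not code from this commit:

#include <stdlib.h>

/* Scalar reference for the mask vp8_mbloop_filter_neon computes:
 * every neighbour difference must be within `limit`, and the combined
 * edge difference within `blimit`. */
static int mb_filter_mask(int limit, int blimit,
                          int p3, int p2, int p1, int p0,
                          int q0, int q1, int q2, int q3) {
    int mask = abs(p3 - p2) <= limit;
    mask &= abs(p2 - p1) <= limit;
    mask &= abs(p1 - p0) <= limit;
    mask &= abs(q1 - q0) <= limit;
    mask &= abs(q2 - q1) <= limit;
    mask &= abs(q3 - q2) <= limit;
    mask &= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
    return mask;
}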
210
media/libvpx/vp8/common/arm/neon/reconintra_neon.c
Normal file
@ -0,0 +1,210 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "vp8/common/blockd.h"

void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
                                           unsigned char * yabove_row,
                                           unsigned char * yleft,
                                           int left_stride,
                                           unsigned char * ypred_ptr,
                                           int y_stride) {
    const int mode = x->mode_info_context->mbmi.mode;
    int i;

    switch (mode) {
        case DC_PRED:
        {
            int shift = x->up_available + x->left_available;
            uint8x16_t v_expected_dc = vdupq_n_u8(128);

            if (shift) {
                unsigned int average = 0;
                int expected_dc;
                if (x->up_available) {
                    const uint8x16_t v_above = vld1q_u8(yabove_row);
                    const uint16x8_t a = vpaddlq_u8(v_above);
                    const uint32x4_t b = vpaddlq_u16(a);
                    const uint64x2_t c = vpaddlq_u32(b);
                    const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                                                  vreinterpret_u32_u64(vget_high_u64(c)));
                    average = vget_lane_u32(d, 0);
                }
                if (x->left_available) {
                    for (i = 0; i < 16; ++i) {
                        average += yleft[0];
                        yleft += left_stride;
                    }
                }
                shift += 3;
                expected_dc = (average + (1 << (shift - 1))) >> shift;
                v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
            }
            for (i = 0; i < 16; ++i) {
                vst1q_u8(ypred_ptr, v_expected_dc);
                ypred_ptr += y_stride;
            }
        }
        break;
        case V_PRED:
        {
            const uint8x16_t v_above = vld1q_u8(yabove_row);
            for (i = 0; i < 16; ++i) {
                vst1q_u8(ypred_ptr, v_above);
                ypred_ptr += y_stride;
            }
        }
        break;
        case H_PRED:
        {
            for (i = 0; i < 16; ++i) {
                const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
                yleft += left_stride;
                vst1q_u8(ypred_ptr, v_yleft);
                ypred_ptr += y_stride;
            }
        }
        break;
        case TM_PRED:
        {
            const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
            const uint8x16_t v_above = vld1q_u8(yabove_row);
            for (i = 0; i < 16; ++i) {
                const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
                const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
                const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
                const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
                                                 vreinterpretq_s16_u16(v_ytop_left));
                const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
                                                 vreinterpretq_s16_u16(v_ytop_left));
                const uint8x8_t pred_lo = vqmovun_s16(b_lo);
                const uint8x8_t pred_hi = vqmovun_s16(b_hi);

                vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
                ypred_ptr += y_stride;
                yleft += left_stride;
            }
        }
        break;
    }
}

void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
                                            unsigned char * uabove_row,
                                            unsigned char * vabove_row,
                                            unsigned char * uleft,
                                            unsigned char * vleft,
                                            int left_stride,
                                            unsigned char * upred_ptr,
                                            unsigned char * vpred_ptr,
                                            int pred_stride) {
    const int mode = x->mode_info_context->mbmi.uv_mode;
    int i;

    switch (mode) {
        case DC_PRED:
        {
            int shift = x->up_available + x->left_available;
            uint8x8_t v_expected_udc = vdup_n_u8(128);
            uint8x8_t v_expected_vdc = vdup_n_u8(128);

            if (shift) {
                unsigned int average_u = 0;
                unsigned int average_v = 0;
                int expected_udc;
                int expected_vdc;
                if (x->up_available) {
                    const uint8x8_t v_uabove = vld1_u8(uabove_row);
                    const uint8x8_t v_vabove = vld1_u8(vabove_row);
                    const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
                    const uint32x4_t b = vpaddlq_u16(a);
                    const uint64x2_t c = vpaddlq_u32(b);
                    average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
                    average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
                }
                if (x->left_available) {
                    for (i = 0; i < 8; ++i) {
                        average_u += uleft[0];
                        uleft += left_stride;
                        average_v += vleft[0];
                        vleft += left_stride;
                    }
                }
                shift += 2;
                expected_udc = (average_u + (1 << (shift - 1))) >> shift;
                expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
                v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
                v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
            }
            for (i = 0; i < 8; ++i) {
                vst1_u8(upred_ptr, v_expected_udc);
                upred_ptr += pred_stride;
                vst1_u8(vpred_ptr, v_expected_vdc);
                vpred_ptr += pred_stride;
            }
        }
        break;
        case V_PRED:
        {
            const uint8x8_t v_uabove = vld1_u8(uabove_row);
            const uint8x8_t v_vabove = vld1_u8(vabove_row);
            for (i = 0; i < 8; ++i) {
                vst1_u8(upred_ptr, v_uabove);
                upred_ptr += pred_stride;
                vst1_u8(vpred_ptr, v_vabove);
                vpred_ptr += pred_stride;
            }
        }
        break;
        case H_PRED:
        {
            for (i = 0; i < 8; ++i) {
                const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
                const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
                uleft += left_stride;
                vleft += left_stride;
                vst1_u8(upred_ptr, v_uleft);
                upred_ptr += pred_stride;
                vst1_u8(vpred_ptr, v_vleft);
                vpred_ptr += pred_stride;
            }
        }
        break;
        case TM_PRED:
        {
            const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
            const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
            const uint8x8_t v_uabove = vld1_u8(uabove_row);
            const uint8x8_t v_vabove = vld1_u8(vabove_row);
            for (i = 0; i < 8; ++i) {
                const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
                const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
                const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
                const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
                const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
                                                vreinterpretq_s16_u16(v_utop_left));
                const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
                                                vreinterpretq_s16_u16(v_vtop_left));
                const uint8x8_t pred_u = vqmovun_s16(b_u);
                const uint8x8_t pred_v = vqmovun_s16(b_v);

                vst1_u8(upred_ptr, pred_u);
                vst1_u8(vpred_ptr, pred_v);
                upred_ptr += pred_stride;
                vpred_ptr += pred_stride;
                uleft += left_stride;
                vleft += left_stride;
            }
        }
        break;
    }
}
@ -1,207 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_sad16x16_neon|
    EXPORT |vp8_sad16x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
|vp8_sad16x16_neon| PROC
;;
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabdl.u8 q12, d0, d8
    vabdl.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

;;
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

;;
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

;;
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0]
    vld1.8 {q7}, [r2]

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vadd.u16 q0, q12, q13

    vpaddl.u16 q1, q0
    vpaddl.u32 q0, q1

    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]

    bx lr

    ENDP

;==============================
;unsigned int vp8_sad16x8_c(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride)
|vp8_sad16x8_neon| PROC
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabdl.u8 q12, d0, d8
    vabdl.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vadd.u16 q0, q12, q13

    vpaddl.u16 q1, q0
    vpaddl.u32 q0, q1

    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]

    bx lr

    ENDP

    END
@ -1,209 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_sad8x8_neon|
    EXPORT |vp8_sad8x16_neon|
    EXPORT |vp8_sad4x4_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; unsigned int vp8_sad8x8_c(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride)

|vp8_sad8x8_neon| PROC
    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabdl.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q12, d6, d14

    vpaddl.u16 q1, q12
    vpaddl.u32 q0, q1
    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]

    bx lr

    ENDP

;============================
;unsigned int vp8_sad8x16_c(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride)

|vp8_sad8x16_neon| PROC
    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabdl.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q12, d6, d14

    vpaddl.u16 q1, q12
    vpaddl.u32 q0, q1
    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]

    bx lr

    ENDP

;===========================
;unsigned int vp8_sad4x4_c(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride)

|vp8_sad4x4_neon| PROC
    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabdl.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q12, d6, d14

    vpaddl.u16 d1, d24
    vpaddl.u32 d0, d1
    vmov.32 r0, d0[0]

    bx lr

    ENDP

    END
184
media/libvpx/vp8/common/arm/neon/sad_neon.c
Normal file
@ -0,0 +1,184 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

unsigned int vp8_sad8x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 7; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

unsigned int vp8_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

unsigned int vp8_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    d1 = vpaddl_u16(vget_low_u16(q12));
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

unsigned int vp8_sad16x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 15; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

unsigned int vp8_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}
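Every SAD routine above finishes with the same widening horizontal reduction of the u16 accumulator. As a reference, a standalone sketch of that idiom (the helper name is ours, not the library's):

#include <arm_neon.h>

/* Collapse eight u16 partial SADs into one scalar, exactly as the
 * functions above do after their accumulation loops. */
static unsigned int horizontal_add_u16x8(uint16x8_t v) {
    const uint32x4_t a = vpaddlq_u16(v);   /* 8 x u16 -> 4 x u32 */
    const uint64x2_t b = vpaddlq_u32(a);   /* 4 x u32 -> 2 x u64 */
    const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                  vreinterpret_u32_u64(vget_high_u64(b)));
    return vget_lane_u32(c, 0);
}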
@ -1,36 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_push_neon|
    EXPORT |vp8_pop_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

|vp8_push_neon| PROC
    vst1.i64 {d8, d9, d10, d11}, [r0]!
    vst1.i64 {d12, d13, d14, d15}, [r0]!
    bx lr

    ENDP

|vp8_pop_neon| PROC
    vld1.i64 {d8, d9, d10, d11}, [r0]!
    vld1.i64 {d12, d13, d14, d15}, [r0]!
    bx lr

    ENDP

    END
@ -1,139 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_short_idct4x4llm_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;*************************************************************
;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
;                            unsigned char *dst, int stride)
;r0 short * input
;r1 short * pred
;r2 int pitch
;r3 unsigned char dst
;sp int stride
;*************************************************************

; static const int cospi8sqrt2minus1=20091;
; static const int sinpi8sqrt2      =35468;
; static const int rounding = 0;

; Optimization note: The data resulting from dequantization are signed
; 13-bit values in the range [-4096, 4095]. This allows the "vqdmulh"
; (NEON) instruction to be used, since the product won't go out of range
; (13+16+1=30 bits < 32 bits). This instruction gives the high half of
; the multiplication result, which is what the IDCT needs.

|vp8_short_idct4x4llm_neon| PROC
    adr r12, idct_coeff
    vld1.16 {q1, q2}, [r0]
    vld1.16 {d0}, [r12]

    vswp d3, d4 ;q2(vp[4] vp[12])
    ldr r0, [sp] ; stride

    vqdmulh.s16 q3, q2, d0[2]
    vqdmulh.s16 q4, q2, d0[0]

    vqadd.s16 d12, d2, d3 ;a1
    vqsub.s16 d13, d2, d3 ;b1

    vshr.s16 q3, q3, #1
    vshr.s16 q4, q4, #1

    vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
    vqadd.s16 q4, q4, q2

    ;d6 - c1:temp1
    ;d7 - d1:temp2
    ;d8 - d1:temp1
    ;d9 - c1:temp2

    vqsub.s16 d10, d6, d9 ;c1
    vqadd.s16 d11, d7, d8 ;d1

    vqadd.s16 d2, d12, d11
    vqadd.s16 d3, d13, d10
    vqsub.s16 d4, d13, d10
    vqsub.s16 d5, d12, d11

    vtrn.32 d2, d4
    vtrn.32 d3, d5
    vtrn.16 d2, d3
    vtrn.16 d4, d5

    vswp d3, d4

    vqdmulh.s16 q3, q2, d0[2]
    vqdmulh.s16 q4, q2, d0[0]

    vqadd.s16 d12, d2, d3 ;a1
    vqsub.s16 d13, d2, d3 ;b1

    vshr.s16 q3, q3, #1
    vshr.s16 q4, q4, #1

    vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
    vqadd.s16 q4, q4, q2

    vqsub.s16 d10, d6, d9 ;c1
    vqadd.s16 d11, d7, d8 ;d1

    vqadd.s16 d2, d12, d11
    vqadd.s16 d3, d13, d10
    vqsub.s16 d4, d13, d10
    vqsub.s16 d5, d12, d11

    vrshr.s16 d2, d2, #3
    vrshr.s16 d3, d3, #3
    vrshr.s16 d4, d4, #3
    vrshr.s16 d5, d5, #3

    vtrn.32 d2, d4
    vtrn.32 d3, d5
    vtrn.16 d2, d3
    vtrn.16 d4, d5

    ; load prediction data
    vld1.32 d6[0], [r1], r2
    vld1.32 d6[1], [r1], r2
    vld1.32 d7[0], [r1], r2
    vld1.32 d7[1], [r1], r2

    ; add prediction and residual
    vaddw.u8 q1, q1, d6
    vaddw.u8 q2, q2, d7

    vqmovun.s16 d1, q1
    vqmovun.s16 d2, q2

    ; store to destination
    vst1.32 d1[0], [r3], r0
    vst1.32 d1[1], [r3], r0
    vst1.32 d2[0], [r3], r0
    vst1.32 d2[1], [r3], r0

    bx lr

    ENDP

;-----------------

idct_coeff
    DCD 0x4e7b4e7b, 0x8a8c8a8c

;20091, 20091, 35468, 35468

    END
123
media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
Normal file
@ -0,0 +1,123 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;

void vp8_short_idct4x4llm_neon(
        int16_t *input,
        unsigned char *pred_ptr,
        int pred_stride,
        unsigned char *dst_ptr,
        int dst_stride) {
    int i;
    uint32x2_t d6u32 = vdup_n_u32(0);
    uint8x8_t d1u8;
    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
    uint16x8_t q1u16;
    int16x8_t q1s16, q2s16, q3s16, q4s16;
    int32x2x2_t v2tmp0, v2tmp1;
    int16x4x2_t v2tmp2, v2tmp3;

    d2 = vld1_s16(input);
    d3 = vld1_s16(input + 4);
    d4 = vld1_s16(input + 8);
    d5 = vld1_s16(input + 12);

    // 1st for loop
    q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
    q2s16 = vcombine_s16(d3, d5);

    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1

    q3s16 = vshrq_n_s16(q3s16, 1);
    q4s16 = vshrq_n_s16(q4s16, 1);

    q3s16 = vqaddq_s16(q3s16, q2s16);
    q4s16 = vqaddq_s16(q4s16, q2s16);

    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1

    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);

    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                      vreinterpret_s16_s32(v2tmp1.val[0]));
    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                      vreinterpret_s16_s32(v2tmp1.val[1]));

    // 2nd for loop
    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);

    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);

    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1

    q3s16 = vshrq_n_s16(q3s16, 1);
    q4s16 = vshrq_n_s16(q4s16, 1);

    q3s16 = vqaddq_s16(q3s16, q2s16);
    q4s16 = vqaddq_s16(q4s16, q2s16);

    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1

    d2 = vqadd_s16(d12, d11);
    d3 = vqadd_s16(d13, d10);
    d4 = vqsub_s16(d13, d10);
    d5 = vqsub_s16(d12, d11);

    d2 = vrshr_n_s16(d2, 3);
    d3 = vrshr_n_s16(d3, 3);
    d4 = vrshr_n_s16(d4, 3);
    d5 = vrshr_n_s16(d5, 3);

    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
                      vreinterpret_s16_s32(v2tmp1.val[0]));
    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
                      vreinterpret_s16_s32(v2tmp1.val[1]));

    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);

    // dc_only_idct_add
    for (i = 0; i < 2; i++, q1s16 = q2s16) {
        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
        pred_ptr += pred_stride;
        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
        pred_ptr += pred_stride;

        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
                         vreinterpret_u8_u32(d6u32));
        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));

        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
        dst_ptr += dst_stride;
        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
        dst_ptr += dst_stride;
    }
    return;
}
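The removed assembly's optimization note applies unchanged to the vqdmulhq_n_s16 calls in the C port above: vqdmulh returns the high half of a doubled product, and 13-bit dequantized input cannot saturate it. A minimal standalone check (illustration only, not part of this commit):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
    /* vqdmulhq_n_s16 computes (a * b * 2) >> 16 with saturation; for
     * |a| <= 4096 the doubled product fits in 13 + 16 + 1 = 30 bits,
     * so saturation can never trigger. */
    const int16x8_t a = vdupq_n_s16(4095);        /* largest 13-bit value */
    const int16x8_t r = vqdmulhq_n_s16(a, 20091); /* cospi8sqrt2minus1 */
    printf("%d == %ld\n", vgetq_lane_s16(r, 0), (4095L * 20091 * 2) >> 16);
    return 0;
}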
@ -1,490 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_sixtap_predict16x16_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

filter16_coeff
    DCD 0, 0, 128, 0, 0, 0, 0, 0
    DCD 0, -6, 123, 12, -1, 0, 0, 0
    DCD 2, -11, 108, 36, -8, 1, 0, 0
    DCD 0, -9, 93, 50, -6, 0, 0, 0
    DCD 3, -16, 77, 77, -16, 3, 0, 0
    DCD 0, -6, 50, 93, -9, 0, 0, 0
    DCD 1, -8, 36, 108, -11, 2, 0, 0
    DCD 0, -1, 12, 123, -6, 0, 0, 0

; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch

;Note: To take advantage of the 8-bit multiplication instructions in NEON, first
; apply abs() to the filter coeffs to make them u8, then use vmlsl for the negative
; coeffs. After multiplication the result can be negative, so it is treated as s16.
; But the result can also be a large positive number (> 2^15-1), which would be
; misread as a negative number. To avoid that error, apply the filter coeffs in the
; order 0, 1, 4, 5, 2, which keeps the running result in s16 range. Finally, add the
; result of the 3rd filter coeff with a saturating add. The same applies to the other
; filter functions. (A C sketch of this ordering follows below.)
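For readers following this commit's conversion of assembly to intrinsics, the same ordering reads naturally in C; a hedged sketch of one 8-pixel horizontal tap accumulation (coefficient values as in filter16_coeff above; the helper shape is ours, not the library's):

#include <arm_neon.h>

/* One 8-wide row of the six-tap filter. c[] holds abs() of the taps;
 * taps 1 and 4 are subtracted (the negative coefficients), taps
 * 0, 1, 4, 5, 2 accumulate first so the running value stays in s16
 * range, and tap 3 is added last with saturation. */
static uint8x8_t sixtap_8px(const unsigned char *src, const uint8x8_t c[6]) {
    uint16x8_t acc = vmull_u8(vld1_u8(src - 2), c[0]);        /* tap 0  */
    acc = vmlsl_u8(acc, vld1_u8(src - 1), c[1]);              /* -tap 1 */
    acc = vmlsl_u8(acc, vld1_u8(src + 2), c[4]);              /* -tap 4 */
    acc = vmlal_u8(acc, vld1_u8(src + 3), c[5]);              /* tap 5  */
    acc = vmlal_u8(acc, vld1_u8(src),     c[2]);              /* tap 2  */
    const uint16x8_t mid = vmull_u8(vld1_u8(src + 1), c[3]);  /* tap 3  */
    const int16x8_t sum = vqaddq_s16(vreinterpretq_s16_u16(acc),
                                     vreinterpretq_s16_u16(mid));
    return vqrshrun_n_s16(sum, 7);  /* round, shift by 7, saturate to u8 */
}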

|vp8_sixtap_predict16x16_neon| PROC
    push {r4-r5, lr}

    adr r12, filter16_coeff
    ldr r4, [sp, #12] ;load parameters from stack
    ldr r5, [sp, #16] ;load parameters from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq secondpass_filter16x16_only

    add r2, r12, r2, lsl #5 ;calculate filter location

    cmp r3, #0 ;skip second_pass filter if yoffset=0

    vld1.s32 {q14, q15}, [r2] ;load first_pass filter

    beq firstpass_filter16x16_only

    sub sp, sp, #336 ;reserve space on stack for temporary storage
    mov lr, sp

    vabs.s32 q12, q14
    vabs.s32 q13, q15

    mov r2, #7 ;loop counter
    sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
    sub r0, r0, r1, lsl #1

    vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
    vdup.8 d1, d24[4]
    vdup.8 d2, d25[0]
    vdup.8 d3, d25[4]
    vdup.8 d4, d26[0]
    vdup.8 d5, d26[4]

;First Pass: output_height lines x output_width columns (21x16)
filt_blk2d_fp16x16_loop_neon
    vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
    vld1.u8 {d9, d10, d11}, [r0], r1
    vld1.u8 {d12, d13, d14}, [r0], r1

    pld [r0]
    pld [r0, r1]
    pld [r0, r1, lsl #1]

    vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8 q9, d7, d0
    vmull.u8 q10, d9, d0
    vmull.u8 q11, d10, d0
    vmull.u8 q12, d12, d0
    vmull.u8 q13, d13, d0

    vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
    vext.8 d29, d9, d10, #1
    vext.8 d30, d12, d13, #1

    vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8 q10, d29, d1
    vmlsl.u8 q12, d30, d1

    vext.8 d28, d7, d8, #1
    vext.8 d29, d10, d11, #1
    vext.8 d30, d13, d14, #1

    vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8 q11, d29, d1
    vmlsl.u8 q13, d30, d1

    vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
    vext.8 d29, d9, d10, #4
    vext.8 d30, d12, d13, #4

    vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8 q10, d29, d4
    vmlsl.u8 q12, d30, d4

    vext.8 d28, d7, d8, #4
    vext.8 d29, d10, d11, #4
    vext.8 d30, d13, d14, #4

    vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8 q11, d29, d4
    vmlsl.u8 q13, d30, d4

    vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
    vext.8 d29, d9, d10, #5
    vext.8 d30, d12, d13, #5

    vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8 q10, d29, d5
    vmlal.u8 q12, d30, d5

    vext.8 d28, d7, d8, #5
    vext.8 d29, d10, d11, #5
    vext.8 d30, d13, d14, #5

    vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8 q11, d29, d5
    vmlal.u8 q13, d30, d5

    vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
    vext.8 d29, d9, d10, #2
    vext.8 d30, d12, d13, #2

    vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8 q10, d29, d2
    vmlal.u8 q12, d30, d2

    vext.8 d28, d7, d8, #2
    vext.8 d29, d10, d11, #2
    vext.8 d30, d13, d14, #2

    vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8 q11, d29, d2
    vmlal.u8 q13, d30, d2

    vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
    vext.8 d29, d9, d10, #3
    vext.8 d30, d12, d13, #3

    vext.8 d15, d7, d8, #3
    vext.8 d31, d10, d11, #3
    vext.8 d6, d13, d14, #3

    vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
    vmull.u8 q5, d29, d3
    vmull.u8 q6, d30, d3

    vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
    vqadd.s16 q10, q5
    vqadd.s16 q12, q6

    vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
    vmull.u8 q7, d31, d3
    vmull.u8 q3, d6, d3

    subs r2, r2, #1

    vqadd.s16 q9, q6
    vqadd.s16 q11, q7
    vqadd.s16 q13, q3

    vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
    vqrshrun.s16 d7, q9, #7
    vqrshrun.s16 d8, q10, #7
    vqrshrun.s16 d9, q11, #7
    vqrshrun.s16 d10, q12, #7
    vqrshrun.s16 d11, q13, #7

    vst1.u8 {d6, d7, d8}, [lr]! ;store result
    vst1.u8 {d9, d10, d11}, [lr]!

    bne filt_blk2d_fp16x16_loop_neon

;Second pass: 16x16
;secondpass_filter - do first 8-columns and then second 8-columns
    add r3, r12, r3, lsl #5
    sub lr, lr, #336

    vld1.s32 {q5, q6}, [r3] ;load second_pass filter
    mov r3, #2 ;loop counter

    vabs.s32 q7, q5
    vabs.s32 q8, q6

    mov r2, #16

    vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
    vdup.8 d1, d14[4]
    vdup.8 d2, d15[0]
    vdup.8 d3, d15[4]
    vdup.8 d4, d16[0]
    vdup.8 d5, d16[4]

filt_blk2d_sp16x16_outloop_neon
    vld1.u8 {d18}, [lr], r2 ;load src data
    vld1.u8 {d19}, [lr], r2
    vld1.u8 {d20}, [lr], r2
    vld1.u8 {d21}, [lr], r2
    mov r12, #4 ;loop counter
    vld1.u8 {d22}, [lr], r2

secondpass_inner_loop_neon
    vld1.u8 {d23}, [lr], r2 ;load src data
    vld1.u8 {d24}, [lr], r2
    vld1.u8 {d25}, [lr], r2
    vld1.u8 {d26}, [lr], r2

    vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8 q4, d19, d0
    vmull.u8 q5, d20, d0
    vmull.u8 q6, d21, d0

    vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8 q4, d20, d1
    vmlsl.u8 q5, d21, d1
    vmlsl.u8 q6, d22, d1

    vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8 q4, d23, d4
    vmlsl.u8 q5, d24, d4
    vmlsl.u8 q6, d25, d4

    vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8 q4, d21, d2
    vmlal.u8 q5, d22, d2
    vmlal.u8 q6, d23, d2

    vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8 q4, d24, d5
    vmlal.u8 q5, d25, d5
    vmlal.u8 q6, d26, d5

    vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
    vmull.u8 q8, d22, d3
    vmull.u8 q9, d23, d3
    vmull.u8 q10, d24, d3

    subs r12, r12, #1

    vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
    vqadd.s16 q8, q4
    vqadd.s16 q9, q5
    vqadd.s16 q10, q6

    vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
    vqrshrun.s16 d7, q8, #7
    vqrshrun.s16 d8, q9, #7
    vqrshrun.s16 d9, q10, #7

    vst1.u8 {d6}, [r4], r5 ;store result
    vmov q9, q11
    vst1.u8 {d7}, [r4], r5
    vmov q10, q12
    vst1.u8 {d8}, [r4], r5
    vmov d22, d26
    vst1.u8 {d9}, [r4], r5

    bne secondpass_inner_loop_neon

    subs r3, r3, #1
    sub lr, lr, #336
    add lr, lr, #8

    sub r4, r4, r5, lsl #4
    add r4, r4, #8

    bne filt_blk2d_sp16x16_outloop_neon

    add sp, sp, #336
    pop {r4-r5,pc}

;--------------------
firstpass_filter16x16_only
    vabs.s32 q12, q14
    vabs.s32 q13, q15

    mov r2, #8 ;loop counter
    sub r0, r0, #2 ;move srcptr back to (column-2)

    vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
    vdup.8 d1, d24[4]
    vdup.8 d2, d25[0]
    vdup.8 d3, d25[4]
    vdup.8 d4, d26[0]
    vdup.8 d5, d26[4]

;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
    vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
    vld1.u8 {d9, d10, d11}, [r0], r1

    pld [r0]
    pld [r0, r1]

    vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8 q7, d7, d0
    vmull.u8 q8, d9, d0
    vmull.u8 q9, d10, d0

    vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
    vext.8 d21, d9, d10, #1
    vext.8 d22, d7, d8, #1
    vext.8 d23, d10, d11, #1
    vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
    vext.8 d25, d9, d10, #4
    vext.8 d26, d7, d8, #4
    vext.8 d27, d10, d11, #4
    vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
    vext.8 d29, d9, d10, #5

    vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8 q8, d21, d1
    vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8 q9, d23, d1
    vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8 q8, d25, d4
    vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8 q9, d27, d4
    vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8 q8, d29, d5

    vext.8 d20, d7, d8, #5
    vext.8 d21, d10, d11, #5
    vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
    vext.8 d23, d9, d10, #2
    vext.8 d24, d7, d8, #2
    vext.8 d25, d10, d11, #2

    vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
    vext.8 d27, d9, d10, #3
    vext.8 d28, d7, d8, #3
    vext.8 d29, d10, d11, #3

    vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8 q9, d21, d5
    vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8 q8, d23, d2
    vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8 q9, d25, d2

    vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
    vmull.u8 q11, d27, d3
    vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
    vmull.u8 q15, d29, d3

    vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
    vqadd.s16 q8, q11
    vqadd.s16 q7, q12
    vqadd.s16 q9, q15

    subs r2, r2, #1

    vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
    vqrshrun.s16 d7, q7, #7
    vqrshrun.s16 d8, q8, #7
    vqrshrun.s16 d9, q9, #7

    vst1.u8 {q3}, [r4], r5 ;store result
    vst1.u8 {q4}, [r4], r5

    bne filt_blk2d_fpo16x16_loop_neon

    pop {r4-r5,pc}

;--------------------
secondpass_filter16x16_only
;Second pass: 16x16
    add r3, r12, r3, lsl #5
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_spo16x16_outloop_neon
|
||||
vld1.u8 {d18}, [r0], r1 ;load src data
|
||||
vld1.u8 {d19}, [r0], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r0], r1
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
|
||||
secondpass_only_inner_loop_neon
|
||||
vld1.u8 {d23}, [r0], r1 ;load src data
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q9, q11
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q10, q12
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov d22, d26
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
bne secondpass_only_inner_loop_neon
|
||||
|
||||
subs r3, r3, #1
|
||||
sub r0, r0, r1, lsl #4
|
||||
sub r0, r0, r1, lsl #2
|
||||
sub r0, r0, r1
|
||||
add r0, r0, #8
|
||||
|
||||
sub r4, r4, r5, lsl #4
|
||||
add r4, r4, #8
|
||||
|
||||
bne filt_blk2d_spo16x16_outloop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
END
|
@ -1,422 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


EXPORT |vp8_sixtap_predict4x4_neon|
ARM
REQUIRE8
PRESERVE8

AREA ||.text||, CODE, READONLY, ALIGN=2

filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0

; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(lr) int dst_pitch

|vp8_sixtap_predict4x4_neon| PROC
push {r4, lr}

adr r12, filter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack

cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter4x4_only

add r2, r12, r2, lsl #5 ;calculate filter location

cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter

beq firstpass_filter4x4_only

vabs.s32 q12, q14 ;get abs(filer_parameters)
vabs.s32 q13, q15

sub r0, r0, #2 ;go back 2 columns of src data
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data

;First pass: output_height lines x output_width columns (9x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]

pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5

vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5

vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5

vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1

vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2

vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3

vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data
vld1.u8 {q4}, [r0], r1

vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10

vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1

vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7

;First Pass on rest 5-line data
vld1.u8 {q11}, [r0], r1

vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5

vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])

vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5

vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8

vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]

vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])

vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]

vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]

vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])

vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])

add r3, r12, r3, lsl #5

vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqadd.s16 q12, q11

vext.8 d23, d27, d28, #4
vld1.s32 {q5, q6}, [r3] ;load second_pass filter

vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d30, q8, #7
vqrshrun.s16 d31, q12, #7

;Second pass: 4x4
vabs.s32 q7, q5
vabs.s32 q8, q6

vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4

vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]

vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0

vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5

vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4

vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1

vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2

vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3

add r0, r4, lr
add r1, r0, lr
add r2, r1, lr

vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4

vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7

vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]

pop {r4, pc}


;---------------------
firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filer_parameters)
vabs.s32 q13, q15

sub r0, r0, #2 ;go back 2 columns of src data

;First pass: output_height lines x output_width columns (4x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1

vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]

vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5

vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5

vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5

vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1

vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4

vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2

vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3

add r0, r4, lr
add r1, r0, lr
add r2, r1, lr

vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10

vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7

vst1.32 {d27[0]}, [r4] ;store result
vst1.32 {d27[1]}, [r0]
vst1.32 {d28[0]}, [r1]
vst1.32 {d28[1]}, [r2]

pop {r4, pc}


;---------------------
secondpass_filter4x4_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5

vld1.32 {d27[0]}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.32 {d27[1]}, [r0], r1
vabs.s32 q7, q5
vld1.32 {d28[0]}, [r0], r1
vabs.s32 q8, q6
vld1.32 {d28[1]}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.32 {d29[0]}, [r0], r1
vdup.8 d1, d14[4]
vld1.32 {d29[1]}, [r0], r1
vdup.8 d2, d15[0]
vld1.32 {d30[0]}, [r0], r1
vdup.8 d3, d15[4]
vld1.32 {d30[1]}, [r0], r1
vdup.8 d4, d16[0]
vld1.32 {d31[0]}, [r0], r1
vdup.8 d5, d16[4]

vext.8 d23, d27, d28, #4
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4

vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0

vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5

vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4

vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1

vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2

vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3

add r0, r4, lr
add r1, r0, lr
add r2, r1, lr

vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4

vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7

vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]

pop {r4, pc}

ENDP

;-----------------

END
@ -1,473 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


EXPORT |vp8_sixtap_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8

AREA ||.text||, CODE, READONLY, ALIGN=2

filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0

; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch

|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}

adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack

cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x4_only

add r2, r12, r2, lsl #5 ;calculate filter location

cmp r3, #0 ;skip second_pass filter if yoffset=0

vld1.s32 {q14, q15}, [r2] ;load first_pass filter

beq firstpass_filter8x4_only

sub sp, sp, #32 ;reserve space on stack for temporary storage
vabs.s32 q12, q14
vabs.s32 q13, q15

sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
mov lr, sp
sub r0, r0, r1, lsl #1

vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]

;First pass: output_height lines x output_width columns (9x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1

pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0

vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1

vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1

vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4

vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4

vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2

vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2

vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5

vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5

vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3

vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vld1.u8 {q3}, [r0], r1 ;load src data

vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7

vld1.u8 {q4}, [r0], r1
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q5}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q7}, [r0], r1
vst1.u8 {d25}, [lr]!

;first_pass filtering on the rest 5-line data
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0

vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1

vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1

vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4

vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4

vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2

vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2

vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5

vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5

vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3

vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3

vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7

vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d27, q9, #7
vqrshrun.s16 d28, q10, #7
vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack
vqrshrun.s16 d30, q12, #7

;Second pass: 8x4
;secondpass_filter
add r3, r12, r3, lsl #5
sub lr, lr, #32

vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {q11}, [lr]!

vabs.s32 q7, q5
vabs.s32 q8, q6

vld1.u8 {q12}, [lr]!

vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]

vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0

vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1

vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4

vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2

vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5

vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7

vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5

add sp, sp, #32
pop {r4-r5,pc}

;--------------------
firstpass_filter8x4_only
vabs.s32 q12, q14
vabs.s32 q13, q15

sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vld1.u8 {q3}, [r0], r1 ;load src data

vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]

;First pass: output_height lines x output_width columns (4x8)
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0

vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1

vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1

vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4

vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4

vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2

vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2

vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5

vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5

vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3

vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7

vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5

pop {r4-r5,pc}

;---------------------
secondpass_filter8x4_only
;Second pass: 8x4
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vabs.s32 q7, q5
vabs.s32 q8, q6

vld1.u8 {d22}, [r0], r1
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d25}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d28}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d29}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d30}, [r0], r1

vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0

vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1

vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4

vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2

vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5

vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7

vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5

pop {r4-r5,pc}

ENDP

;-----------------

END
@ -1,524 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


EXPORT |vp8_sixtap_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8

AREA ||.text||, CODE, READONLY, ALIGN=2

filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0

; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pitch

|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}

adr r12, filter8_coeff

ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack

cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x8_only

add r2, r12, r2, lsl #5 ;calculate filter location

cmp r3, #0 ;skip second_pass filter if yoffset=0

vld1.s32 {q14, q15}, [r2] ;load first_pass filter

beq firstpass_filter8x8_only

sub sp, sp, #64 ;reserve space on stack for temporary storage
mov lr, sp

vabs.s32 q12, q14
vabs.s32 q13, q15

mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1

vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]

;First pass: output_height lines x output_width columns (13x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1

filt_blk2d_fp8x8_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0

vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1

vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1

vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4

vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4

vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2

vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2

vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5

vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5

vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3

vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3

subs r2, r2, #1

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vld1.u8 {q3}, [r0], r1 ;load src data

vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7

vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q4}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q5}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d25}, [lr]!

bne filt_blk2d_fp8x8_loop_neon

;first_pass filtering on the rest 5-line data
;vld1.u8 {q3}, [r0], r1 ;load src data
;vld1.u8 {q4}, [r0], r1
;vld1.u8 {q5}, [r0], r1
;vld1.u8 {q6}, [r0], r1
vld1.u8 {q7}, [r0], r1

vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0

vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1

vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1

vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4

vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4

vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2

vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2

vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5

vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5

vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3

vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3

vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7

add r3, r12, r3, lsl #5

vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
sub lr, lr, #64
vqrshrun.s16 d27, q9, #7
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
vqrshrun.s16 d28, q10, #7
vld1.u8 {q10}, [lr]!

vld1.s32 {q5, q6}, [r3] ;load second_pass filter

vqrshrun.s16 d29, q11, #7
vld1.u8 {q11}, [lr]!

vabs.s32 q7, q5
vabs.s32 q8, q6

vqrshrun.s16 d30, q12, #7
vld1.u8 {q12}, [lr]!

;Second pass: 8x8
mov r3, #2 ;loop counter

vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]

filt_blk2d_sp8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0

vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1

vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4

vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2

vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5

vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3

subs r3, r3, #1

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7

vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30

bne filt_blk2d_sp8x8_loop_neon

add sp, sp, #64
pop {r4-r5,pc}

;---------------------
firstpass_filter8x8_only
;add r2, r12, r2, lsl #5 ;calculate filter location
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
vabs.s32 q12, q14
vabs.s32 q13, q15

mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)

vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]

;First pass: output_height lines x output_width columns (8x8)
filt_blk2d_fpo8x8_loop_neon
vld1.u8 {q3}, [r0], r1 ;load src data
vld1.u8 {q4}, [r0], r1
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1

pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]

vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0

vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1

vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1

vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4

vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4

vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2

vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2

vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5

vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5

vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3

vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
;
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

subs r2, r2, #1

vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7

vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5

bne filt_blk2d_fpo8x8_loop_neon

pop {r4-r5,pc}

;---------------------
secondpass_filter8x8_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5

vld1.u8 {d18}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {d19}, [r0], r1
vabs.s32 q7, q5
vld1.u8 {d20}, [r0], r1
vabs.s32 q8, q6
vld1.u8 {d21}, [r0], r1
mov r3, #2 ;loop counter
vld1.u8 {d22}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d23}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d24}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d25}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1

;Second pass: 8x8
filt_blk2d_spo8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0

vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1

vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4

vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2

vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5

vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3

subs r3, r3, #1

vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6

vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7

vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30

bne filt_blk2d_spo8x8_loop_neon

pop {r4-r5,pc}

ENDP

;-----------------

END
1757 media/libvpx/vp8/common/arm/neon/sixtappredict_neon.c (new file)
File diff suppressed because it is too large
@ -1,276 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


EXPORT |vp8_variance16x16_neon|
EXPORT |vp8_variance16x8_neon|
EXPORT |vp8_variance8x16_neon|
EXPORT |vp8_variance8x8_neon|

ARM
REQUIRE8
PRESERVE8

AREA ||.text||, CODE, READONLY, ALIGN=2

; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0

mov r12, #8

variance16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3

vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7

;VPADAL adds adjacent pairs of elements of a vector, and accumulates
;the results into the elements of the destination vector. The explanation
;in ARM guide is wrong.
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23

subs r12, r12, #1

vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29

bne variance16x16_neon_loop

vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum

ldr r12, [sp] ;load *sse from stack

vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3

;vmov.32 r0, d0[0] ;this instruction costs a lot
;vmov.32 r1, d1[0]
;mul r0, r0, r0
;str r1, [r12]
;sub r0, r1, r0, lsr #8

; while sum is signed, sum * sum is always positive and must be treated as
; unsigned to avoid propagating the sign bit.
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10

vmov.32 r0, d0[0] ;return
bx lr

ENDP

;================================
;unsigned int vp8_variance16x8_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp8_variance16x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0

mov r12, #4

variance16x8_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3

vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7

vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23

subs r12, r12, #1

vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29

bne variance16x8_neon_loop

vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum

ldr r12, [sp] ;load *sse from stack

vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3

vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #7
vsub.u32 d0, d1, d10

vmov.32 r0, d0[0] ;return
bx lr

ENDP

;=================================
;unsigned int vp8_variance8x16_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)

|vp8_variance8x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0

mov r12, #8

variance8x16_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3

vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d2, d6

vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23

subs r12, r12, #1

vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25

bne variance8x16_neon_loop

vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum

ldr r12, [sp] ;load *sse from stack

vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3

vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #7
vsub.u32 d0, d1, d10

vmov.32 r0, d0[0] ;return
bx lr

ENDP

;==================================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0

mov r12, #2

variance8x8_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3

vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7

vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23

subs r12, r12, #1

vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29

bne variance8x8_neon_loop

vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum

ldr r12, [sp] ;load *sse from stack

vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3

vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #6
vsub.u32 d0, d1, d10

vmov.32 r0, d0[0] ;return
bx lr

ENDP

END
323 media/libvpx/vp8/common/arm/neon/variance_neon.c (new file)
@ -0,0 +1,323 @@
/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif

unsigned int vp8_variance16x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
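
Each kernel in this file follows the same shape: accumulate the per-pixel sum and sum of squared differences (sse), store sse, then return sse - sum*sum/N with N the block's pixel count; the vshr_n_u32 by 8 here (and by 7 or 6 in the smaller blocks below) is the division by 256, 128, or 64. A minimal scalar sketch of the same computation, for reference only (the helper name and shape are ours, not part of this diff):

#include <stdint.h>

/* Scalar model of the NEON variance kernels (illustrative sketch).
 * w*h must match the shift used above: 256 -> 8, 128 -> 7, 64 -> 6. */
static unsigned int variance_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
    int64_t sum = 0;
    uint32_t sq = 0;
    int r, c;
    for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
            const int diff = src[c] - ref[c];  /* one lane of vsubl_u8 */
            sum += diff;                       /* the vpadalq_s16 chain */
            sq += (uint32_t)(diff * diff);     /* the vmlal_s16 chain */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sq;
    return sq - (unsigned int)((sum * sum) / (w * h));
}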

unsigned int vp8_variance16x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
        q0u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        q1u8 = vld1q_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        q2u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q3u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x16_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d2u8, d4u8, d6u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        __builtin_prefetch(src_ptr);

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        __builtin_prefetch(ref_ptr);

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d2u8, d6u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}

unsigned int vp8_variance8x8_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
        d0u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d1u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d2u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;
        d3u8 = vld1_u8(src_ptr);
        src_ptr += source_stride;

        d4u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d5u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d6u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;
        d7u8 = vld1_u8(ref_ptr);
        ref_ptr += recon_stride;

        q11u16 = vsubl_u8(d0u8, d4u8);
        q12u16 = vsubl_u8(d1u8, d5u8);
        q13u16 = vsubl_u8(d2u8, d6u8);
        q14u16 = vsubl_u8(d3u8, d7u8);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
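
A hypothetical caller, shown only to make the contract explicit: the return value is the variance and *sse additionally receives the raw sum of squared differences. The prototype normally comes from the generated vp8_rtcd.h; it is repeated here so the sketch stands alone, and the wrapper name is ours.

unsigned int vp8_variance16x16_neon(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse);

/* Illustrative wrapper (not part of the diff). */
static unsigned int block16x16_var(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride) {
    unsigned int sse;
    return vp8_variance16x16_neon(src, src_stride, ref, ref_stride, &sse);
}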
@@ -1,423 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


;-----------------

    EXPORT |vp8_sub_pixel_variance16x16_neon_func|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.

bilinear_taps_coeff
    DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
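
The table above holds the eight two-tap bilinear pairs {128 - 16k, 16k}; the add r2, r12, r2, lsl #3 below selects pair k = xoffset as an eight-byte offset from this base (two DCD words per pair), and the vmull/vmlal/vqrshrn #7 sequence applies it with rounding. A C model of the lookup and one filter step, as a sketch only (names are ours, not part of the diff):

/* Model of bilinear_taps_coeff indexing (illustrative, not part of the diff). */
static const int bilinear_taps[8][2] = {
    {128, 0}, {112, 16}, {96, 32}, {80, 48},
    {64, 64}, {48, 80}, {32, 96}, {16, 112}
};

/* One filter-pass output pixel: matches vmull.u8/vmlal.u8 + vqrshrn.u16 #7,
 * i.e. a rounding shift by VP8_FILTER_SHIFT (7). */
static unsigned char bilinear_tap(unsigned char s0, unsigned char s1,
                                  int offset) {  /* offset = xoffset or yoffset */
    return (unsigned char)((s0 * bilinear_taps[offset][0] +
                            s1 * bilinear_taps[offset][1] + 64) >> 7);
}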

|vp8_sub_pixel_variance16x16_neon_func| PROC
    push {r4-r6, lr}

    adr r12, bilinear_taps_coeff
    ldr r4, [sp, #16] ;load *dst_ptr from stack
    ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
    ldr r6, [sp, #24] ;load *sse from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq secondpass_bfilter16x16_only

    add r2, r12, r2, lsl #3 ;calculate filter location

    cmp r3, #0 ;skip second_pass filter if yoffset=0

    vld1.s32 {d31}, [r2] ;load first_pass filter

    beq firstpass_bfilter16x16_only

    sub sp, sp, #272 ;reserve space on stack for temporary storage
    vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
    mov lr, sp
    vld1.u8 {d5, d6, d7}, [r0], r1

    mov r2, #3 ;loop counter
    vld1.u8 {d8, d9, d10}, [r0], r1

    vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
    vld1.u8 {d11, d12, d13}, [r0], r1

    vdup.8 d1, d31[4]

;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16_loop_neon
    pld [r0]
    pld [r0, r1]
    pld [r0, r1, lsl #1]

    vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
    vmull.u8 q8, d3, d0
    vmull.u8 q9, d5, d0
    vmull.u8 q10, d6, d0
    vmull.u8 q11, d8, d0
    vmull.u8 q12, d9, d0
    vmull.u8 q13, d11, d0
    vmull.u8 q14, d12, d0

    vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
    vext.8 d5, d5, d6, #1
    vext.8 d8, d8, d9, #1
    vext.8 d11, d11, d12, #1

    vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q9, d5, d1
    vmlal.u8 q11, d8, d1
    vmlal.u8 q13, d11, d1

    vext.8 d3, d3, d4, #1
    vext.8 d6, d6, d7, #1
    vext.8 d9, d9, d10, #1
    vext.8 d12, d12, d13, #1

    vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q10, d6, d1
    vmlal.u8 q12, d9, d1
    vmlal.u8 q14, d12, d1

    subs r2, r2, #1

    vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d15, q8, #7
    vqrshrn.u16 d16, q9, #7
    vqrshrn.u16 d17, q10, #7
    vqrshrn.u16 d18, q11, #7
    vqrshrn.u16 d19, q12, #7
    vqrshrn.u16 d20, q13, #7

    vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
    vqrshrn.u16 d21, q14, #7
    vld1.u8 {d5, d6, d7}, [r0], r1

    vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
    vld1.u8 {d8, d9, d10}, [r0], r1
    vst1.u8 {d18, d19, d20, d21}, [lr]!
    vld1.u8 {d11, d12, d13}, [r0], r1

    bne vp8e_filt_blk2d_fp16x16_loop_neon

;First-pass filtering for rest 5 lines
    vld1.u8 {d14, d15, d16}, [r0], r1

    vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
    vmull.u8 q10, d3, d0
    vmull.u8 q11, d5, d0
    vmull.u8 q12, d6, d0
    vmull.u8 q13, d8, d0
    vmull.u8 q14, d9, d0

    vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
    vext.8 d5, d5, d6, #1
    vext.8 d8, d8, d9, #1

    vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q11, d5, d1
    vmlal.u8 q13, d8, d1

    vext.8 d3, d3, d4, #1
    vext.8 d6, d6, d7, #1
    vext.8 d9, d9, d10, #1

    vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q12, d6, d1
    vmlal.u8 q14, d9, d1

    vmull.u8 q1, d11, d0
    vmull.u8 q2, d12, d0
    vmull.u8 q3, d14, d0
    vmull.u8 q4, d15, d0

    vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
    vext.8 d14, d14, d15, #1

    vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q3, d14, d1

    vext.8 d12, d12, d13, #1
    vext.8 d15, d15, d16, #1

    vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q4, d15, d1

    vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d11, q10, #7
    vqrshrn.u16 d12, q11, #7
    vqrshrn.u16 d13, q12, #7
    vqrshrn.u16 d14, q13, #7
    vqrshrn.u16 d15, q14, #7
    vqrshrn.u16 d16, q1, #7
    vqrshrn.u16 d17, q2, #7
    vqrshrn.u16 d18, q3, #7
    vqrshrn.u16 d19, q4, #7

    vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
    vst1.u8 {d14, d15, d16, d17}, [lr]!
    vst1.u8 {d18, d19}, [lr]!

;Second pass: 16x16
;secondpass_filter
    add r3, r12, r3, lsl #3
    sub lr, lr, #272

    vld1.u32 {d31}, [r3] ;load second_pass filter

    sub sp, sp, #256
    mov r3, sp

    vld1.u8 {d22, d23}, [lr]! ;load src data

    vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
    vdup.8 d1, d31[4]
    mov r12, #4 ;loop counter

vp8e_filt_blk2d_sp16x16_loop_neon
    vld1.u8 {d24, d25}, [lr]!
    vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
    vld1.u8 {d26, d27}, [lr]!
    vmull.u8 q2, d23, d0
    vld1.u8 {d28, d29}, [lr]!
    vmull.u8 q3, d24, d0
    vld1.u8 {d30, d31}, [lr]!

    vmull.u8 q4, d25, d0
    vmull.u8 q5, d26, d0
    vmull.u8 q6, d27, d0
    vmull.u8 q7, d28, d0
    vmull.u8 q8, d29, d0

    vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8 q2, d25, d1
    vmlal.u8 q3, d26, d1
    vmlal.u8 q4, d27, d1
    vmlal.u8 q5, d28, d1
    vmlal.u8 q6, d29, d1
    vmlal.u8 q7, d30, d1
    vmlal.u8 q8, d31, d1

    subs r12, r12, #1

    vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d3, q2, #7
    vqrshrn.u16 d4, q3, #7
    vqrshrn.u16 d5, q4, #7
    vqrshrn.u16 d6, q5, #7
    vqrshrn.u16 d7, q6, #7
    vqrshrn.u16 d8, q7, #7
    vqrshrn.u16 d9, q8, #7

    vst1.u8 {d2, d3}, [r3]! ;store result
    vst1.u8 {d4, d5}, [r3]!
    vst1.u8 {d6, d7}, [r3]!
    vmov q11, q15
    vst1.u8 {d8, d9}, [r3]!

    bne vp8e_filt_blk2d_sp16x16_loop_neon

    b sub_pixel_variance16x16_neon

;--------------------
firstpass_bfilter16x16_only
    mov r2, #4 ;loop counter
    sub sp, sp, #528 ;reserve space on stack for temporary storage
    vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
    vdup.8 d1, d31[4]
    mov r3, sp

;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16_loop_neon
    vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
    vld1.u8 {d5, d6, d7}, [r0], r1
    vld1.u8 {d8, d9, d10}, [r0], r1
    vld1.u8 {d11, d12, d13}, [r0], r1

    pld [r0]
    pld [r0, r1]
    pld [r0, r1, lsl #1]

    vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
    vmull.u8 q8, d3, d0
    vmull.u8 q9, d5, d0
    vmull.u8 q10, d6, d0
    vmull.u8 q11, d8, d0
    vmull.u8 q12, d9, d0
    vmull.u8 q13, d11, d0
    vmull.u8 q14, d12, d0

    vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
    vext.8 d5, d5, d6, #1
    vext.8 d8, d8, d9, #1
    vext.8 d11, d11, d12, #1

    vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q9, d5, d1
    vmlal.u8 q11, d8, d1
    vmlal.u8 q13, d11, d1

    vext.8 d3, d3, d4, #1
    vext.8 d6, d6, d7, #1
    vext.8 d9, d9, d10, #1
    vext.8 d12, d12, d13, #1

    vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
    vmlal.u8 q10, d6, d1
    vmlal.u8 q12, d9, d1
    vmlal.u8 q14, d12, d1

    subs r2, r2, #1

    vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d15, q8, #7
    vqrshrn.u16 d16, q9, #7
    vqrshrn.u16 d17, q10, #7
    vqrshrn.u16 d18, q11, #7
    vqrshrn.u16 d19, q12, #7
    vqrshrn.u16 d20, q13, #7
    vst1.u8 {d14, d15}, [r3]! ;store result
    vqrshrn.u16 d21, q14, #7

    vst1.u8 {d16, d17}, [r3]!
    vst1.u8 {d18, d19}, [r3]!
    vst1.u8 {d20, d21}, [r3]!

    bne vp8e_filt_blk2d_fpo16x16_loop_neon

    b sub_pixel_variance16x16_neon

;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
    sub sp, sp, #528 ;reserve space on stack for temporary storage
    add r3, r12, r3, lsl #3
    mov r12, #4 ;loop counter
    vld1.u32 {d31}, [r3] ;load second_pass filter
    vld1.u8 {d22, d23}, [r0], r1 ;load src data
    mov r3, sp

    vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
    vdup.8 d1, d31[4]

vp8e_filt_blk2d_spo16x16_loop_neon
    vld1.u8 {d24, d25}, [r0], r1
    vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
    vld1.u8 {d26, d27}, [r0], r1
    vmull.u8 q2, d23, d0
    vld1.u8 {d28, d29}, [r0], r1
    vmull.u8 q3, d24, d0
    vld1.u8 {d30, d31}, [r0], r1

    vmull.u8 q4, d25, d0
    vmull.u8 q5, d26, d0
    vmull.u8 q6, d27, d0
    vmull.u8 q7, d28, d0
    vmull.u8 q8, d29, d0

    vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8 q2, d25, d1
    vmlal.u8 q3, d26, d1
    vmlal.u8 q4, d27, d1
    vmlal.u8 q5, d28, d1
    vmlal.u8 q6, d29, d1
    vmlal.u8 q7, d30, d1
    vmlal.u8 q8, d31, d1

    vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d3, q2, #7
    vqrshrn.u16 d4, q3, #7
    vqrshrn.u16 d5, q4, #7
    vqrshrn.u16 d6, q5, #7
    vqrshrn.u16 d7, q6, #7
    vqrshrn.u16 d8, q7, #7
    vqrshrn.u16 d9, q8, #7

    vst1.u8 {d2, d3}, [r3]! ;store result
    subs r12, r12, #1
    vst1.u8 {d4, d5}, [r3]!
    vmov q11, q15
    vst1.u8 {d6, d7}, [r3]!
    vst1.u8 {d8, d9}, [r3]!

    bne vp8e_filt_blk2d_spo16x16_loop_neon

    b sub_pixel_variance16x16_neon

;----------------------------
;variance16x16
sub_pixel_variance16x16_neon
    vmov.i8 q8, #0 ;q8 - sum
    vmov.i8 q9, #0 ;q9, q10 - sse
    vmov.i8 q10, #0

    sub r3, r3, #256
    mov r12, #8

sub_pixel_variance16x16_neon_loop
    vld1.8 {q0}, [r3]! ;Load up source and reference
    vld1.8 {q2}, [r4], r5
    vld1.8 {q1}, [r3]!
    vld1.8 {q3}, [r4], r5

    vsubl.u8 q11, d0, d4 ;diff
    vsubl.u8 q12, d1, d5
    vsubl.u8 q13, d2, d6
    vsubl.u8 q14, d3, d7

    vpadal.s16 q8, q11 ;sum
    vmlal.s16 q9, d22, d22 ;sse
    vmlal.s16 q10, d23, d23

    subs r12, r12, #1

    vpadal.s16 q8, q12
    vmlal.s16 q9, d24, d24
    vmlal.s16 q10, d25, d25
    vpadal.s16 q8, q13
    vmlal.s16 q9, d26, d26
    vmlal.s16 q10, d27, d27
    vpadal.s16 q8, q14
    vmlal.s16 q9, d28, d28
    vmlal.s16 q10, d29, d29

    bne sub_pixel_variance16x16_neon_loop

    vadd.u32 q10, q9, q10 ;accumulate sse
    vpaddl.s32 q0, q8 ;accumulate sum

    vpaddl.u32 q1, q10
    vadd.s64 d0, d0, d1
    vadd.u64 d1, d2, d3

    vmull.s32 q5, d0, d0
    vst1.32 {d1[0]}, [r6] ;store sse
    vshr.u32 d10, d10, #8
    vsub.u32 d0, d1, d10

    add sp, sp, #528
    vmov.32 r0, d0[0] ;return

    pop {r4-r6,pc}

    ENDP

    END
@@ -1,572 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_variance_halfpixvar16x16_h_neon|
    EXPORT |vp8_variance_halfpixvar16x16_v_neon|
    EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
    EXPORT |vp8_sub_pixel_variance16x16s_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;================================================
;unsigned int vp8_variance_halfpixvar16x16_h_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_h_neon| PROC
    push {lr}

    mov r12, #4 ;loop counter
    ldr lr, [sp, #4] ;load *sse from stack
    vmov.i8 q8, #0 ;q8 - sum
    vmov.i8 q9, #0 ;q9, q10 - sse
    vmov.i8 q10, #0

;First Pass: output_height lines x output_width columns (16x16)
vp8_filt_fpo16x16s_4_0_loop_neon
    vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
    vld1.8 {q11}, [r2], r3
    vld1.u8 {d4, d5, d6, d7}, [r0], r1
    vld1.8 {q12}, [r2], r3
    vld1.u8 {d8, d9, d10, d11}, [r0], r1
    vld1.8 {q13}, [r2], r3
    vld1.u8 {d12, d13, d14, d15}, [r0], r1

    ;pld [r0]
    ;pld [r0, r1]
    ;pld [r0, r1, lsl #1]

    vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
    vext.8 q3, q2, q3, #1
    vext.8 q5, q4, q5, #1
    vext.8 q7, q6, q7, #1

    vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vld1.8 {q14}, [r2], r3
    vrhadd.u8 q1, q2, q3
    vrhadd.u8 q2, q4, q5
    vrhadd.u8 q3, q6, q7

    vsubl.u8 q4, d0, d22 ;diff
    vsubl.u8 q5, d1, d23
    vsubl.u8 q6, d2, d24
    vsubl.u8 q7, d3, d25
    vsubl.u8 q0, d4, d26
    vsubl.u8 q1, d5, d27
    vsubl.u8 q2, d6, d28
    vsubl.u8 q3, d7, d29

    vpadal.s16 q8, q4 ;sum
    vmlal.s16 q9, d8, d8 ;sse
    vmlal.s16 q10, d9, d9

    subs r12, r12, #1

    vpadal.s16 q8, q5
    vmlal.s16 q9, d10, d10
    vmlal.s16 q10, d11, d11
    vpadal.s16 q8, q6
    vmlal.s16 q9, d12, d12
    vmlal.s16 q10, d13, d13
    vpadal.s16 q8, q7
    vmlal.s16 q9, d14, d14
    vmlal.s16 q10, d15, d15

    vpadal.s16 q8, q0 ;sum
    vmlal.s16 q9, d0, d0 ;sse
    vmlal.s16 q10, d1, d1
    vpadal.s16 q8, q1
    vmlal.s16 q9, d2, d2
    vmlal.s16 q10, d3, d3
    vpadal.s16 q8, q2
    vmlal.s16 q9, d4, d4
    vmlal.s16 q10, d5, d5
    vpadal.s16 q8, q3
    vmlal.s16 q9, d6, d6
    vmlal.s16 q10, d7, d7

    bne vp8_filt_fpo16x16s_4_0_loop_neon

    vadd.u32 q10, q9, q10 ;accumulate sse
    vpaddl.s32 q0, q8 ;accumulate sum

    vpaddl.u32 q1, q10
    vadd.s64 d0, d0, d1
    vadd.u64 d1, d2, d3

    vmull.s32 q5, d0, d0
    vst1.32 {d1[0]}, [lr] ;store sse
    vshr.u32 d10, d10, #8
    vsub.u32 d0, d1, d10

    vmov.32 r0, d0[0] ;return
    pop {pc}
    ENDP

;================================================
;unsigned int vp8_variance_halfpixvar16x16_v_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_v_neon| PROC
    push {lr}

    mov r12, #4 ;loop counter

    vld1.u8 {q0}, [r0], r1 ;load src data
    ldr lr, [sp, #4] ;load *sse from stack

    vmov.i8 q8, #0 ;q8 - sum
    vmov.i8 q9, #0 ;q9, q10 - sse
    vmov.i8 q10, #0

vp8_filt_spo16x16s_0_4_loop_neon
    vld1.u8 {q2}, [r0], r1
    vld1.8 {q1}, [r2], r3
    vld1.u8 {q4}, [r0], r1
    vld1.8 {q3}, [r2], r3
    vld1.u8 {q6}, [r0], r1
    vld1.8 {q5}, [r2], r3
    vld1.u8 {q15}, [r0], r1

    vrhadd.u8 q0, q0, q2
    vld1.8 {q7}, [r2], r3
    vrhadd.u8 q2, q2, q4
    vrhadd.u8 q4, q4, q6
    vrhadd.u8 q6, q6, q15

    vsubl.u8 q11, d0, d2 ;diff
    vsubl.u8 q12, d1, d3
    vsubl.u8 q13, d4, d6
    vsubl.u8 q14, d5, d7
    vsubl.u8 q0, d8, d10
    vsubl.u8 q1, d9, d11
    vsubl.u8 q2, d12, d14
    vsubl.u8 q3, d13, d15

    vpadal.s16 q8, q11 ;sum
    vmlal.s16 q9, d22, d22 ;sse
    vmlal.s16 q10, d23, d23

    subs r12, r12, #1

    vpadal.s16 q8, q12
    vmlal.s16 q9, d24, d24
    vmlal.s16 q10, d25, d25
    vpadal.s16 q8, q13
    vmlal.s16 q9, d26, d26
    vmlal.s16 q10, d27, d27
    vpadal.s16 q8, q14
    vmlal.s16 q9, d28, d28
    vmlal.s16 q10, d29, d29

    vpadal.s16 q8, q0 ;sum
    vmlal.s16 q9, d0, d0 ;sse
    vmlal.s16 q10, d1, d1
    vpadal.s16 q8, q1
    vmlal.s16 q9, d2, d2
    vmlal.s16 q10, d3, d3
    vpadal.s16 q8, q2
    vmlal.s16 q9, d4, d4
    vmlal.s16 q10, d5, d5

    vmov q0, q15

    vpadal.s16 q8, q3
    vmlal.s16 q9, d6, d6
    vmlal.s16 q10, d7, d7

    bne vp8_filt_spo16x16s_0_4_loop_neon

    vadd.u32 q10, q9, q10 ;accumulate sse
    vpaddl.s32 q0, q8 ;accumulate sum

    vpaddl.u32 q1, q10
    vadd.s64 d0, d0, d1
    vadd.u64 d1, d2, d3

    vmull.s32 q5, d0, d0
    vst1.32 {d1[0]}, [lr] ;store sse
    vshr.u32 d10, d10, #8
    vsub.u32 d0, d1, d10

    vmov.32 r0, d0[0] ;return
    pop {pc}
    ENDP

;================================================
;unsigned int vp8_variance_halfpixvar16x16_hv_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_hv_neon| PROC
    push {lr}

    vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data

    ldr lr, [sp, #4] ;load *sse from stack
    vmov.i8 q13, #0 ;q8 - sum
    vext.8 q1, q0, q1, #1 ;construct src_ptr[1]

    vmov.i8 q14, #0 ;q9, q10 - sse
    vmov.i8 q15, #0

    mov r12, #4 ;loop counter
    vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1

;First Pass: output_height lines x output_width columns (17x16)
vp8_filt16x16s_4_4_loop_neon
    vld1.u8 {d4, d5, d6, d7}, [r0], r1
    vld1.u8 {d8, d9, d10, d11}, [r0], r1
    vld1.u8 {d12, d13, d14, d15}, [r0], r1
    vld1.u8 {d16, d17, d18, d19}, [r0], r1

    ;pld [r0]
    ;pld [r0, r1]
    ;pld [r0, r1, lsl #1]

    vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
    vext.8 q5, q4, q5, #1
    vext.8 q7, q6, q7, #1
    vext.8 q9, q8, q9, #1

    vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vrhadd.u8 q2, q4, q5
    vrhadd.u8 q3, q6, q7
    vrhadd.u8 q4, q8, q9

    vld1.8 {q5}, [r2], r3
    vrhadd.u8 q0, q0, q1
    vld1.8 {q6}, [r2], r3
    vrhadd.u8 q1, q1, q2
    vld1.8 {q7}, [r2], r3
    vrhadd.u8 q2, q2, q3
    vld1.8 {q8}, [r2], r3
    vrhadd.u8 q3, q3, q4

    vsubl.u8 q9, d0, d10 ;diff
    vsubl.u8 q10, d1, d11
    vsubl.u8 q11, d2, d12
    vsubl.u8 q12, d3, d13

    vsubl.u8 q0, d4, d14 ;diff
    vsubl.u8 q1, d5, d15
    vsubl.u8 q5, d6, d16
    vsubl.u8 q6, d7, d17

    vpadal.s16 q13, q9 ;sum
    vmlal.s16 q14, d18, d18 ;sse
    vmlal.s16 q15, d19, d19

    vpadal.s16 q13, q10 ;sum
    vmlal.s16 q14, d20, d20 ;sse
    vmlal.s16 q15, d21, d21

    vpadal.s16 q13, q11 ;sum
    vmlal.s16 q14, d22, d22 ;sse
    vmlal.s16 q15, d23, d23

    vpadal.s16 q13, q12 ;sum
    vmlal.s16 q14, d24, d24 ;sse
    vmlal.s16 q15, d25, d25

    subs r12, r12, #1

    vpadal.s16 q13, q0 ;sum
    vmlal.s16 q14, d0, d0 ;sse
    vmlal.s16 q15, d1, d1

    vpadal.s16 q13, q1 ;sum
    vmlal.s16 q14, d2, d2 ;sse
    vmlal.s16 q15, d3, d3

    vpadal.s16 q13, q5 ;sum
    vmlal.s16 q14, d10, d10 ;sse
    vmlal.s16 q15, d11, d11

    vmov q0, q4

    vpadal.s16 q13, q6 ;sum
    vmlal.s16 q14, d12, d12 ;sse
    vmlal.s16 q15, d13, d13

    bne vp8_filt16x16s_4_4_loop_neon

    vadd.u32 q15, q14, q15 ;accumulate sse
    vpaddl.s32 q0, q13 ;accumulate sum

    vpaddl.u32 q1, q15
    vadd.s64 d0, d0, d1
    vadd.u64 d1, d2, d3

    vmull.s32 q5, d0, d0
    vst1.32 {d1[0]}, [lr] ;store sse
    vshr.u32 d10, d10, #8
    vsub.u32 d0, d1, d10

    vmov.32 r0, d0[0] ;return
    pop {pc}
    ENDP

;==============================
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack unsigned char *dst_ptr,
; stack int dst_pixels_per_line,
; stack unsigned int *sse
;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step()
;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter,
;or filter coeff is {64, 64}. This simplified program only works in this situation.
;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later.
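
With taps restricted to {64, 64}, the filter collapses to a rounded average, which is exactly what the vrhadd.u8 instructions in this file compute. A one-line C equivalent, as an illustrative sketch only (the helper name is ours):

/* Scalar equivalent of vrhadd.u8 in the half-pixel case:
 * (a*64 + b*64 + 64) >> 7 == (a + b + 1) >> 1. */
static unsigned char half_pixel_avg(unsigned char a, unsigned char b) {
    return (unsigned char)((a + b + 1) >> 1);
}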

|vp8_sub_pixel_variance16x16s_neon| PROC
    push {r4, lr}

    ldr r4, [sp, #8] ;load *dst_ptr from stack
    ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
    ldr lr, [sp, #16] ;load *sse from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq secondpass_bfilter16x16s_only

    cmp r3, #0 ;skip second_pass filter if yoffset=0
    beq firstpass_bfilter16x16s_only

    vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
    sub sp, sp, #256 ;reserve space on stack for temporary storage
    vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
    mov r3, sp
    mov r2, #4 ;loop counter
    vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1

;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16s_loop_neon
    vld1.u8 {d4, d5, d6, d7}, [r0], r1
    vld1.u8 {d8, d9, d10, d11}, [r0], r1
    vld1.u8 {d12, d13, d14, d15}, [r0], r1
    vld1.u8 {d16, d17, d18, d19}, [r0], r1

    ;pld [r0]
    ;pld [r0, r1]
    ;pld [r0, r1, lsl #1]

    vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
    vext.8 q5, q4, q5, #1
    vext.8 q7, q6, q7, #1
    vext.8 q9, q8, q9, #1

    vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vrhadd.u8 q2, q4, q5
    vrhadd.u8 q3, q6, q7
    vrhadd.u8 q4, q8, q9

    vrhadd.u8 q0, q0, q1
    vrhadd.u8 q1, q1, q2
    vrhadd.u8 q2, q2, q3
    vrhadd.u8 q3, q3, q4

    subs r2, r2, #1
    vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
    vmov q0, q4
    vst1.u8 {d4, d5, d6, d7}, [r3]!

    bne vp8e_filt_blk2d_fp16x16s_loop_neon

    b sub_pixel_variance16x16s_neon

;--------------------
firstpass_bfilter16x16s_only
    mov r2, #2 ;loop counter
    sub sp, sp, #256 ;reserve space on stack for temporary storage
    mov r3, sp

;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16s_loop_neon
    vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
    vld1.u8 {d4, d5, d6, d7}, [r0], r1
    vld1.u8 {d8, d9, d10, d11}, [r0], r1
    vld1.u8 {d12, d13, d14, d15}, [r0], r1

    ;pld [r0]
    ;pld [r0, r1]
    ;pld [r0, r1, lsl #1]

    vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
    vld1.u8 {d16, d17, d18, d19}, [r0], r1
    vext.8 q3, q2, q3, #1
    vld1.u8 {d20, d21, d22, d23}, [r0], r1
    vext.8 q5, q4, q5, #1
    vld1.u8 {d24, d25, d26, d27}, [r0], r1
    vext.8 q7, q6, q7, #1
    vld1.u8 {d28, d29, d30, d31}, [r0], r1
    vext.8 q9, q8, q9, #1
    vext.8 q11, q10, q11, #1
    vext.8 q13, q12, q13, #1
    vext.8 q15, q14, q15, #1

    vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
    vrhadd.u8 q1, q2, q3
    vrhadd.u8 q2, q4, q5
    vrhadd.u8 q3, q6, q7
    vrhadd.u8 q4, q8, q9
    vrhadd.u8 q5, q10, q11
    vrhadd.u8 q6, q12, q13
    vrhadd.u8 q7, q14, q15

    subs r2, r2, #1

    vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
    vst1.u8 {d4, d5, d6, d7}, [r3]!
    vst1.u8 {d8, d9, d10, d11}, [r3]!
    vst1.u8 {d12, d13, d14, d15}, [r3]!

    bne vp8e_filt_blk2d_fpo16x16s_loop_neon

    b sub_pixel_variance16x16s_neon

;---------------------
secondpass_bfilter16x16s_only
    sub sp, sp, #256 ;reserve space on stack for temporary storage

    mov r2, #2 ;loop counter
    vld1.u8 {d0, d1}, [r0], r1 ;load src data
    mov r3, sp

vp8e_filt_blk2d_spo16x16s_loop_neon
    vld1.u8 {d2, d3}, [r0], r1
    vld1.u8 {d4, d5}, [r0], r1
    vld1.u8 {d6, d7}, [r0], r1
    vld1.u8 {d8, d9}, [r0], r1

    vrhadd.u8 q0, q0, q1
    vld1.u8 {d10, d11}, [r0], r1
    vrhadd.u8 q1, q1, q2
    vld1.u8 {d12, d13}, [r0], r1
    vrhadd.u8 q2, q2, q3
    vld1.u8 {d14, d15}, [r0], r1
    vrhadd.u8 q3, q3, q4
    vld1.u8 {d16, d17}, [r0], r1
    vrhadd.u8 q4, q4, q5
    vrhadd.u8 q5, q5, q6
    vrhadd.u8 q6, q6, q7
    vrhadd.u8 q7, q7, q8

    subs r2, r2, #1

    vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
    vmov q0, q8
    vst1.u8 {d4, d5, d6, d7}, [r3]!
    vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
    vst1.u8 {d12, d13, d14, d15}, [r3]!

    bne vp8e_filt_blk2d_spo16x16s_loop_neon

    b sub_pixel_variance16x16s_neon

;----------------------------
;variance16x16
sub_pixel_variance16x16s_neon
    vmov.i8 q8, #0 ;q8 - sum
    vmov.i8 q9, #0 ;q9, q10 - sse
    vmov.i8 q10, #0

    sub r3, r3, #256
    mov r2, #4

sub_pixel_variance16x16s_neon_loop
    vld1.8 {q0}, [r3]! ;Load up source and reference
    vld1.8 {q1}, [r4], r12
    vld1.8 {q2}, [r3]!
    vld1.8 {q3}, [r4], r12
    vld1.8 {q4}, [r3]!
    vld1.8 {q5}, [r4], r12
    vld1.8 {q6}, [r3]!
    vld1.8 {q7}, [r4], r12

    vsubl.u8 q11, d0, d2 ;diff
    vsubl.u8 q12, d1, d3
    vsubl.u8 q13, d4, d6
    vsubl.u8 q14, d5, d7
    vsubl.u8 q0, d8, d10
    vsubl.u8 q1, d9, d11
    vsubl.u8 q2, d12, d14
    vsubl.u8 q3, d13, d15

    vpadal.s16 q8, q11 ;sum
    vmlal.s16 q9, d22, d22 ;sse
    vmlal.s16 q10, d23, d23

    subs r2, r2, #1

    vpadal.s16 q8, q12
    vmlal.s16 q9, d24, d24
    vmlal.s16 q10, d25, d25
    vpadal.s16 q8, q13
    vmlal.s16 q9, d26, d26
    vmlal.s16 q10, d27, d27
    vpadal.s16 q8, q14
    vmlal.s16 q9, d28, d28
    vmlal.s16 q10, d29, d29

    vpadal.s16 q8, q0 ;sum
    vmlal.s16 q9, d0, d0 ;sse
    vmlal.s16 q10, d1, d1
    vpadal.s16 q8, q1
    vmlal.s16 q9, d2, d2
    vmlal.s16 q10, d3, d3
    vpadal.s16 q8, q2
    vmlal.s16 q9, d4, d4
    vmlal.s16 q10, d5, d5
    vpadal.s16 q8, q3
    vmlal.s16 q9, d6, d6
    vmlal.s16 q10, d7, d7

    bne sub_pixel_variance16x16s_neon_loop

    vadd.u32 q10, q9, q10 ;accumulate sse
    vpaddl.s32 q0, q8 ;accumulate sum

    vpaddl.u32 q1, q10
    vadd.s64 d0, d0, d1
    vadd.u64 d1, d2, d3

    vmull.s32 q5, d0, d0
    vst1.32 {d1[0]}, [lr] ;store sse
    vshr.u32 d10, d10, #8
    vsub.u32 d0, d1, d10

    add sp, sp, #256
    vmov.32 r0, d0[0] ;return

    pop {r4, pc}
    ENDP

    END
@@ -1,222 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_sub_pixel_variance8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.

|vp8_sub_pixel_variance8x8_neon| PROC
    push {r4-r5, lr}

    adr r12, bilinear_taps_coeff
    ldr r4, [sp, #12] ;load *dst_ptr from stack
    ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
    ldr lr, [sp, #20] ;load *sse from stack

    cmp r2, #0 ;skip first_pass filter if xoffset=0
    beq skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add r2, r12, r2, lsl #3 ;calculate filter location

    vld1.u8 {q1}, [r0], r1 ;load src data
    vld1.u32 {d31}, [r2] ;load first_pass filter
    vld1.u8 {q2}, [r0], r1
    vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
    vld1.u8 {q3}, [r0], r1
    vdup.8 d1, d31[4]
    vld1.u8 {q4}, [r0], r1

    vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
    vmull.u8 q7, d4, d0
    vmull.u8 q8, d6, d0
    vmull.u8 q9, d8, d0

    vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
    vext.8 d5, d4, d5, #1
    vext.8 d7, d6, d7, #1
    vext.8 d9, d8, d9, #1

    vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
    vmlal.u8 q7, d5, d1
    vmlal.u8 q8, d7, d1
    vmlal.u8 q9, d9, d1

    vld1.u8 {q1}, [r0], r1 ;load src data
    vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
    vld1.u8 {q2}, [r0], r1
    vqrshrn.u16 d23, q7, #7
    vld1.u8 {q3}, [r0], r1
    vqrshrn.u16 d24, q8, #7
    vld1.u8 {q4}, [r0], r1
    vqrshrn.u16 d25, q9, #7

;first_pass filtering on the rest 5-line data
    vld1.u8 {q5}, [r0], r1

    vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
    vmull.u8 q7, d4, d0
    vmull.u8 q8, d6, d0
    vmull.u8 q9, d8, d0
    vmull.u8 q10, d10, d0

    vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
    vext.8 d5, d4, d5, #1
    vext.8 d7, d6, d7, #1
    vext.8 d9, d8, d9, #1
    vext.8 d11, d10, d11, #1

    vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
    vmlal.u8 q7, d5, d1
    vmlal.u8 q8, d7, d1
    vmlal.u8 q9, d9, d1
    vmlal.u8 q10, d11, d1

    vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d27, q7, #7
    vqrshrn.u16 d28, q8, #7
    vqrshrn.u16 d29, q9, #7
    vqrshrn.u16 d30, q10, #7

;Second pass: 8x8
secondpass_filter
    cmp r3, #0 ;skip second_pass filter if yoffset=0
    ;skip_secondpass_filter
    beq sub_pixel_variance8x8_neon

    add r3, r12, r3, lsl #3

    vld1.u32 {d31}, [r3] ;load second_pass filter

    vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
    vdup.8 d1, d31[4]

    vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
    vmull.u8 q2, d23, d0
    vmull.u8 q3, d24, d0
    vmull.u8 q4, d25, d0
    vmull.u8 q5, d26, d0
    vmull.u8 q6, d27, d0
    vmull.u8 q7, d28, d0
    vmull.u8 q8, d29, d0

    vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8 q2, d24, d1
    vmlal.u8 q3, d25, d1
    vmlal.u8 q4, d26, d1
    vmlal.u8 q5, d27, d1
    vmlal.u8 q6, d28, d1
    vmlal.u8 q7, d29, d1
    vmlal.u8 q8, d30, d1

    vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
    vqrshrn.u16 d23, q2, #7
    vqrshrn.u16 d24, q3, #7
    vqrshrn.u16 d25, q4, #7
    vqrshrn.u16 d26, q5, #7
    vqrshrn.u16 d27, q6, #7
    vqrshrn.u16 d28, q7, #7
    vqrshrn.u16 d29, q8, #7

    b sub_pixel_variance8x8_neon

;--------------------
skip_firstpass_filter
    vld1.u8 {d22}, [r0], r1 ;load src data
    vld1.u8 {d23}, [r0], r1
    vld1.u8 {d24}, [r0], r1
    vld1.u8 {d25}, [r0], r1
    vld1.u8 {d26}, [r0], r1
    vld1.u8 {d27}, [r0], r1
    vld1.u8 {d28}, [r0], r1
    vld1.u8 {d29}, [r0], r1
    vld1.u8 {d30}, [r0], r1

    b secondpass_filter

;----------------------
;vp8_variance8x8_neon
sub_pixel_variance8x8_neon
    vmov.i8 q8, #0 ;q8 - sum
    vmov.i8 q9, #0 ;q9, q10 - sse
    vmov.i8 q10, #0

    mov r12, #2

sub_pixel_variance8x8_neon_loop
    vld1.8 {d0}, [r4], r5 ;load dst data
    subs r12, r12, #1
    vld1.8 {d1}, [r4], r5
    vld1.8 {d2}, [r4], r5
    vsubl.u8 q4, d22, d0 ;calculate diff
    vld1.8 {d3}, [r4], r5

    vsubl.u8 q5, d23, d1
    vsubl.u8 q6, d24, d2

    vpadal.s16 q8, q4 ;sum
    vmlal.s16 q9, d8, d8 ;sse
    vmlal.s16 q10, d9, d9

    vsubl.u8 q7, d25, d3

    vpadal.s16 q8, q5
    vmlal.s16 q9, d10, d10
    vmlal.s16 q10, d11, d11

    vmov q11, q13

    vpadal.s16 q8, q6
    vmlal.s16 q9, d12, d12
    vmlal.s16 q10, d13, d13

    vmov q12, q14

    vpadal.s16 q8, q7
    vmlal.s16 q9, d14, d14
    vmlal.s16 q10, d15, d15

    bne sub_pixel_variance8x8_neon_loop

    vadd.u32 q10, q9, q10 ;accumulate sse
    vpaddl.s32 q0, q8 ;accumulate sum

    vpaddl.u32 q1, q10
    vadd.s64 d0, d0, d1
    vadd.u64 d1, d2, d3

    vmull.s32 q5, d0, d0
    vst1.32 {d1[0]}, [lr] ;store sse
    vshr.u32 d10, d10, #6
    vsub.u32 d0, d1, d10

    vmov.32 r0, d0[0] ;return
    pop {r4-r5, pc}

    ENDP

;-----------------

bilinear_taps_coeff
    DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END
1028
media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
Normal file
File diff suppressed because it is too large
@@ -1,58 +0,0 @@
/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_NEON
extern void vp8_build_intra_predictors_mby_neon_func(
    unsigned char *y_buffer,
    unsigned char *ypred_ptr,
    int y_stride,
    int mode,
    int Up,
    int Left);

void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
{
    unsigned char *y_buffer = x->dst.y_buffer;
    unsigned char *ypred_ptr = x->predictor;
    int y_stride = x->dst.y_stride;
    int mode = x->mode_info_context->mbmi.mode;
    int Up = x->up_available;
    int Left = x->left_available;

    vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}

extern void vp8_build_intra_predictors_mby_s_neon_func(
    unsigned char *y_buffer,
    unsigned char *ypred_ptr,
    int y_stride,
    int mode,
    int Up,
    int Left);

void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
{
    unsigned char *y_buffer = x->dst.y_buffer;
    unsigned char *ypred_ptr = x->predictor;
    int y_stride = x->dst.y_stride;
    int mode = x->mode_info_context->mbmi.mode;
    int Up = x->up_available;
    int Left = x->left_available;

    vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}

#endif
@@ -9,8 +9,8 @@
*/


#ifndef __INC_BLOCKD_H
#define __INC_BLOCKD_H
#ifndef VP8_COMMON_BLOCKD_H_
#define VP8_COMMON_BLOCKD_H_

void vpx_log(const char *format, ...);

@@ -20,6 +20,10 @@ void vpx_log(const char *format, ...);
#include "treecoder.h"
#include "vpx_ports/mem.h"

#ifdef __cplusplus
extern "C" {
#endif

/*#define DCPRED 1*/
#define DCPREDSIMTHRESH 0
#define DCPREDCNTTHRESH 3
@@ -297,4 +301,8 @@ typedef struct macroblockd
extern void vp8_build_block_doffsets(MACROBLOCKD *x);
extern void vp8_setup_block_dptrs(MACROBLOCKD *x);

#endif /* __INC_BLOCKD_H */
#ifdef __cplusplus
} // extern "C"
#endif

#endif // VP8_COMMON_BLOCKD_H_
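
Every header hunk from here on applies the same two changes: the include guard is renamed to the VP8_COMMON_*_H_ form and the declarations are wrapped for C++ consumers. A sketch of the resulting layout (EXAMPLE is a placeholder name, not a real header in the tree):

#ifndef VP8_COMMON_EXAMPLE_H_  /* hypothetical header, pattern only */
#define VP8_COMMON_EXAMPLE_H_

#ifdef __cplusplus
extern "C" {
#endif

/* ... declarations keep C linkage when included from C++ ... */

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_EXAMPLE_H_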
|
||||
|
@ -8,6 +8,12 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP8_COMMON_COEFUPDATEPROBS_H_
|
||||
#define VP8_COMMON_COEFUPDATEPROBS_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Update probabilities for the nodes in the token entropy tree.
|
||||
Generated file included by entropy.c */
|
||||
@ -183,3 +189,9 @@ const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTE
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_COEFUPDATEPROBS_H_
|
||||
|
@ -9,8 +9,8 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef common_h
|
||||
#define common_h 1
|
||||
#ifndef VP8_COMMON_COMMON_H_
|
||||
#define VP8_COMMON_COMMON_H_
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
@ -18,6 +18,13 @@
|
||||
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
|
||||
/* Only need this for fixed-size arrays, for structs just assign. */
|
||||
|
||||
#define vp8_copy( Dest, Src) { \
|
||||
@ -37,4 +44,8 @@
|
||||
#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
|
||||
|
||||
|
||||
#endif /* common_h */
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_COMMON_H_
|
||||
|
@ -8,6 +8,12 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_
|
||||
#define VP8_COMMON_DEFAULT_COEF_PROBS_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*Generated file, included by entropy.c*/
|
||||
|
||||
@ -186,3 +192,9 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES]
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_
|
||||
|
@ -9,12 +9,16 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __INC_ENTROPY_H
|
||||
#define __INC_ENTROPY_H
|
||||
#ifndef VP8_COMMON_ENTROPY_H_
|
||||
#define VP8_COMMON_ENTROPY_H_
|
||||
|
||||
#include "treecoder.h"
|
||||
#include "blockd.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Coefficient token alphabet */
|
||||
|
||||
#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
|
||||
@ -98,4 +102,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]);
|
||||
extern const int vp8_mb_feature_data_bits[MB_LVL_MAX];
|
||||
|
||||
void vp8_coef_tree_initialize(void);
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_ENTROPY_H_
|
||||
|
@ -9,12 +9,16 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __INC_ENTROPYMODE_H
|
||||
#define __INC_ENTROPYMODE_H
|
||||
#ifndef VP8_COMMON_ENTROPYMODE_H_
|
||||
#define VP8_COMMON_ENTROPYMODE_H_
|
||||
|
||||
#include "onyxc_int.h"
|
||||
#include "treecoder.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef enum
|
||||
{
|
||||
SUBMVREF_NORMAL,
|
||||
@ -77,4 +81,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x);
|
||||
void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]);
|
||||
void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_ENTROPYMODE_H_
|
||||
|
@ -9,11 +9,15 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __INC_ENTROPYMV_H
|
||||
#define __INC_ENTROPYMV_H
|
||||
#ifndef VP8_COMMON_ENTROPYMV_H_
|
||||
#define VP8_COMMON_ENTROPYMV_H_
|
||||
|
||||
#include "treecoder.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum
|
||||
{
|
||||
mv_max = 1023, /* max absolute value of a MV component */
|
||||
@ -41,4 +45,8 @@ typedef struct mv_context
|
||||
|
||||
extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2];
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_ENTROPYMV_H_
|
||||
|
@ -9,11 +9,15 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __INC_EXTEND_H
|
||||
#define __INC_EXTEND_H
|
||||
#ifndef VP8_COMMON_EXTEND_H_
|
||||
#define VP8_COMMON_EXTEND_H_
|
||||
|
||||
#include "vpx_scale/yv12config.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr);
|
||||
void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
|
||||
YV12_BUFFER_CONFIG *dst);
|
||||
@ -22,4 +26,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,
|
||||
int srcy, int srcx,
|
||||
int srch, int srcw);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_EXTEND_H_
|
||||
|
@ -9,11 +9,15 @@
|
||||
*/
|
||||
|
||||
|
||||
#ifndef FILTER_H
|
||||
#define FILTER_H
|
||||
#ifndef VP8_COMMON_FILTER_H_
|
||||
#define VP8_COMMON_FILTER_H_
|
||||
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define BLOCK_HEIGHT_WIDTH 4
|
||||
#define VP8_FILTER_WEIGHT 128
|
||||
#define VP8_FILTER_SHIFT 7
|
||||
@ -21,4 +25,8 @@
|
||||
extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]);
|
||||
extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_FILTER_H_
|
||||
|
vp8/common/findnearmv.h:
@@ -9,14 +9,18 @@
  */
 
 
-#ifndef __INC_FINDNEARMV_H
-#define __INC_FINDNEARMV_H
+#ifndef VP8_COMMON_FINDNEARMV_H_
+#define VP8_COMMON_FINDNEARMV_H_
 
 #include "mv.h"
 #include "blockd.h"
 #include "modecont.h"
 #include "treecoder.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 
 static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp,
                     const int *ref_frame_sign_bias)
@@ -179,4 +183,8 @@ static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
     return (cur_mb->bmi + b - 4)->as_mode;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_FINDNEARMV_H_
vp8/common/header.h:
@@ -9,8 +9,12 @@
  */
 
 
-#ifndef __INC_HEADER_H
-#define __INC_HEADER_H
+#ifndef VP8_COMMON_HEADER_H_
+#define VP8_COMMON_HEADER_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* 24 bits total */
 typedef struct
@@ -40,4 +44,8 @@ typedef struct
 #endif
 
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_HEADER_H_
vp8/common/invtrans.h:
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef __INC_INVTRANS_H
-#define __INC_INVTRANS_H
+#ifndef VP8_COMMON_INVTRANS_H_
+#define VP8_COMMON_INVTRANS_H_
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -21,6 +21,10 @@
 #include "vpx_mem/vpx_mem.h"
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 static void eob_adjust(char *eobs, short *diff)
 {
     /* eob adjust.... the idct can only skip if both the dc and eob are zero */
@@ -59,4 +63,8 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd)
                     xd->dst.y_buffer,
                     xd->dst.y_stride, xd->eobs);
 }
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_INVTRANS_H_
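Only the opening lines of eob_adjust appear in the hunk above. For orientation, here is a minimal sketch of the adjustment its comment describes: bump the end-of-block count whenever a block's DC coefficient is nonzero, so the "skip the idct" test (dc == 0 && eob == 0) stays valid. It assumes 16 blocks of 16 coefficients each; the loop body is illustrative, not copied from the new file.

static void eob_adjust_sketch(char *eobs, const short *diff)
{
    int i;
    for (i = 0; i < 16; i++)          /* one eob per 4x4 block */
    {
        if (eobs[i] == 0 && diff[0] != 0)
            eobs[i]++;                /* dc nonzero: idct must not skip */
        diff += 16;                   /* advance to the next block's coefficients */
    }
}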
vp8/common/loopfilter.c:
@@ -15,6 +15,7 @@
 #include "onyxc_int.h"
 #include "vpx_mem/vpx_mem.h"
 
+
 static void lf_init_lut(loop_filter_info_n *lfi)
 {
     int filt_lvl;
vp8/common/loopfilter.h:
@@ -9,13 +9,17 @@
  */
 
 
-#ifndef loopfilter_h
-#define loopfilter_h
+#ifndef VP8_COMMON_LOOPFILTER_H_
+#define VP8_COMMON_LOOPFILTER_H_
 
 #include "vpx_ports/mem.h"
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MAX_LOOP_FILTER 63
 /* fraction of total macroblock rows to be used in fast filter level picking */
 /* has to be > 2 */
@@ -102,4 +106,8 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm,
                                 int mb_row, int post_ystride, int post_uvstride,
                                 unsigned char *y_ptr, unsigned char *u_ptr,
                                 unsigned char *v_ptr);
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_LOOPFILTER_H_
vp8/common/modecont.h:
@@ -9,9 +9,17 @@
  */
 
 
-#ifndef __INC_MODECONT_H
-#define __INC_MODECONT_H
+#ifndef VP8_COMMON_MODECONT_H_
+#define VP8_COMMON_MODECONT_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 extern const int vp8_mode_contexts[6][4];
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_MODECONT_H_
vp8/common/mv.h:
@@ -9,10 +9,14 @@
  */
 
 
-#ifndef __INC_MV_H
-#define __INC_MV_H
+#ifndef VP8_COMMON_MV_H_
+#define VP8_COMMON_MV_H_
 #include "vpx/vpx_integer.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct
 {
     short row;
@@ -25,4 +29,8 @@ typedef union int_mv
     MV as_mv;
 } int_mv; /* facilitates faster equality tests and copies */
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_MV_H_
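The int_mv union kept as context above is what the trailing comment refers to: overlaying a 32-bit integer on the two 16-bit components lets the codec compare and copy motion vectors in a single operation. A compressed restatement, with the types re-declared locally so it stands alone; same_mv is an illustrative helper, not a libvpx function:

#include <stdint.h>

typedef struct { short row; short col; } MV;

typedef union
{
    uint32_t as_int;  /* whole-vector view: one compare, one store */
    MV       as_mv;   /* component view: row/col arithmetic */
} int_mv;

/* One 32-bit compare instead of two 16-bit compares. */
static int same_mv(int_mv a, int_mv b)
{
    return a.as_int == b.as_int;
}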
vp8/common/onyx.h:
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef __INC_VP8_H
-#define __INC_VP8_H
+#ifndef VP8_COMMON_ONYX_H_
+#define VP8_COMMON_ONYX_H_
 
 #ifdef __cplusplus
 extern "C"
@@ -39,8 +39,8 @@ extern "C"
 
     typedef enum
     {
-        USAGE_STREAM_FROM_SERVER = 0x0,
-        USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+        USAGE_LOCAL_FILE_PLAYBACK = 0x0,
+        USAGE_STREAM_FROM_SERVER = 0x1,
         USAGE_CONSTRAINED_QUALITY = 0x2,
         USAGE_CONSTANT_QUALITY = 0x3
    } END_USAGE;
@@ -104,7 +104,18 @@ extern "C"
        struct vpx_rational timebase;
        unsigned int target_bandwidth; /* kilobits per second */
 
-        /* parameter used for applying pre processing blur: recommendation 0 */
+        /* Parameter used for applying denoiser.
+         * For temporal denoiser: noise_sensitivity = 0 means off,
+         * noise_sensitivity = 1 means temporal denoiser on for Y channel only,
+         * noise_sensitivity = 2 means temporal denoiser on for all channels.
+         * noise_sensitivity = 3 means aggressive denoising mode.
+         * noise_sensitivity >= 4 means adaptive denoising mode.
+         * Temporal denoiser is enabled via the configuration option:
+         * CONFIG_TEMPORAL_DENOISING.
+         * For spatial denoiser: noise_sensitivity controls the amount of
+         * pre-processing blur: noise_sensitivity = 0 means off.
+         * Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING.
+         */
         int noise_sensitivity;
 
         /* parameter used for sharpening output: recommendation 0: */
@@ -213,7 +224,7 @@ extern "C"
         int arnr_strength;
         int arnr_type;
 
-        struct vpx_fixed_buf two_pass_stats_in;
+        vpx_fixed_buf_t two_pass_stats_in;
         struct vpx_codec_pkt_list *output_pkt_list;
 
         vp8e_tuning tuning;
@@ -267,4 +278,4 @@ extern "C"
 }
 #endif
 
-#endif
+#endif  // VP8_COMMON_ONYX_H_
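The expanded comment above enumerates the denoiser modes selected through noise_sensitivity. From the public encoder API, the same knob is normally driven through the VP8E_SET_NOISE_SENSITIVITY codec control rather than by touching this struct directly. A minimal sketch, assuming a context already opened with vpx_codec_enc_init() and with error handling elided:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* 0 = off, 1 = temporal denoiser on Y only, 2 = all channels,
 * 3 = aggressive, >= 4 = adaptive (per the comment above). */
static void set_denoiser_mode(vpx_codec_ctx_t *encoder, unsigned int mode)
{
    vpx_codec_control(encoder, VP8E_SET_NOISE_SENSITIVITY, mode);
}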
vp8/common/onyxc_int.h:
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef __INC_VP8C_INT_H
-#define __INC_VP8C_INT_H
+#ifndef VP8_COMMON_ONYXC_INT_H_
+#define VP8_COMMON_ONYXC_INT_H_
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -26,6 +26,10 @@
 #include "header.h"
 /*#endif*/
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define MINQ 0
 #define MAXQ 127
 #define QINDEX_RANGE (MAXQ + 1)
@@ -174,4 +178,8 @@ typedef struct VP8Common
     int cpu_caps;
 } VP8_COMMON;
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_ONYXC_INT_H_
vp8/common/onyxd.h:
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef __INC_VP8D_H
-#define __INC_VP8D_H
+#ifndef VP8_COMMON_ONYXD_H_
+#define VP8_COMMON_ONYXD_H_
 
 
 /* Create/destroy static data structures. */
@@ -60,4 +60,4 @@ extern "C"
 #endif
 
 
-#endif
+#endif  // VP8_COMMON_ONYXD_H_
vp8/common/postproc.c:
@@ -71,11 +71,6 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
 };
 #endif
 
-static const short kernel5[] =
-{
-    1, 1, 4, 1, 1
-};
-
 const short vp8_rv[] =
 {
     8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
@@ -308,13 +303,14 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
         {
             d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
         }
 
-        s[-8*pitch] = d[(r-8)&15];
+        if (r >= 8)
+            s[-8*pitch] = d[(r-8)&15];
         s += pitch;
     }
 }
 
 #if CONFIG_POSTPROC
 static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
                           int q)
 {
@@ -387,6 +383,7 @@ void vp8_deblock(VP8_COMMON *cm,
         vp8_yv12_copy_frame(source, post);
     }
 }
+#endif
 
 #if !(CONFIG_TEMPORAL_DENOISING)
 void vp8_de_noise(VP8_COMMON *cm,
@@ -396,12 +393,12 @@ void vp8_de_noise(VP8_COMMON *cm,
                   int low_var_thresh,
                   int flag)
 {
-    int mbr;
     double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
     int ppl = (int)(level + .5);
-    int mb_rows = source->y_width >> 4;
-    int mb_cols = source->y_height >> 4;
+    int mb_rows = cm->mb_rows;
+    int mb_cols = cm->mb_cols;
     unsigned char *limits = cm->pp_limits_buffer;;
+    int mbr, mbc;
     (void) post;
     (void) low_var_thresh;
     (void) flag;
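Worth noting in the vp8_de_noise hunk above: the old code derived the macroblock grid from the source frame with the axes crossed (rows from y_width, columns from y_height), while the fix simply reads the counts the common struct already caches. A sketch of the intended relationship, assuming luma dimensions that are multiples of 16 as libvpx aligns them:

/* A 16x16 macroblock grid: columns follow width, rows follow height. */
static void mb_grid(int y_width, int y_height, int *mb_cols, int *mb_rows)
{
    *mb_cols = y_width >> 4;   /* e.g. 640x480 -> 40 columns */
    *mb_rows = y_height >> 4;  /*                  30 rows   */
}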
vp8/common/postproc.h:
@@ -9,8 +9,8 @@
  */
 
 
-#ifndef POSTPROC_H
-#define POSTPROC_H
+#ifndef VP8_COMMON_POSTPROC_H_
+#define VP8_COMMON_POSTPROC_H_
 
 #include "vpx_ports/mem.h"
 struct postproc_state
@@ -26,6 +26,10 @@ struct postproc_state
 };
 #include "onyxc_int.h"
 #include "ppflags.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest,
                         vp8_ppflags_t *flags);
+
@@ -47,4 +51,8 @@ void vp8_deblock(struct VP8Common *oci,
 #define MFQE_PRECISION 4
 
 void vp8_multiframe_quality_enhance(struct VP8Common *cm);
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_POSTPROC_H_
vp8/common/ppflags.h:
@@ -9,8 +9,12 @@
  */
 
 
-#ifndef __INC_PPFLAGS_H
-#define __INC_PPFLAGS_H
+#ifndef VP8_COMMON_PPFLAGS_H_
+#define VP8_COMMON_PPFLAGS_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 enum
 {
     VP8D_NOFILTERING = 0,
@@ -38,4 +42,8 @@ typedef struct
     int display_mv_flag;
 } vp8_ppflags_t;
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_PPFLAGS_H_
vp8/common/pragmas.h (deleted):
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-
-#ifdef __INTEL_COMPILER
-#pragma warning(disable:997 1011 170)
-#endif
-#ifdef _MSC_VER
-#pragma warning(disable:4799)
-#endif
vp8/common/quant_common.h:
@@ -8,14 +8,27 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP8_COMMON_QUANT_COMMON_H_
+#define VP8_COMMON_QUANT_COMMON_H_
+
 
 #include "string.h"
 #include "blockd.h"
 #include "onyxc_int.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 extern int vp8_ac_yquant(int QIndex);
 extern int vp8_dc_quant(int QIndex, int Delta);
 extern int vp8_dc2quant(int QIndex, int Delta);
 extern int vp8_ac2quant(int QIndex, int Delta);
 extern int vp8_dc_uv_quant(int QIndex, int Delta);
 extern int vp8_ac_uv_quant(int QIndex, int Delta);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_QUANT_COMMON_H_
vp8/common/reconinter.h:
@@ -9,8 +9,12 @@
  */
 
 
-#ifndef __INC_RECONINTER_H
-#define __INC_RECONINTER_H
+#ifndef VP8_COMMON_RECONINTER_H_
+#define VP8_COMMON_RECONINTER_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
 extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
@@ -32,4 +36,8 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
 extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
 extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_RECONINTER_H_
vp8/common/reconintra4x4.h:
@@ -9,10 +9,14 @@
  */
 
 
-#ifndef __INC_RECONINTRA4x4_H
-#define __INC_RECONINTRA4x4_H
+#ifndef VP8_COMMON_RECONINTRA4X4_H_
+#define VP8_COMMON_RECONINTRA4X4_H_
 #include "vp8/common/blockd.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 static void intra_prediction_down_copy(MACROBLOCKD *xd,
                                        unsigned char *above_right_src)
 {
@@ -29,4 +33,8 @@ static void intra_prediction_down_copy(MACROBLOCKD *xd,
     *dst_ptr2 = *src_ptr;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_RECONINTRA4X4_H_
vp8/common/setupintrarecon.h:
@@ -8,10 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef SETUPINTRARECON_H
-#define SETUPINTRARECON_H
+#ifndef VP8_COMMON_SETUPINTRARECON_H_
+#define VP8_COMMON_SETUPINTRARECON_H_
 
 #include "vpx_scale/yv12config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
 extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf);
+
@@ -34,4 +38,8 @@ void setup_intra_recon_left(unsigned char *y_buffer,
         v_buffer[uv_stride *i] = (unsigned char) 129;
 }
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_SETUPINTRARECON_H_
vp8/common/swapyv12buffer.h:
@@ -9,11 +9,19 @@
  */
 
 
-#ifndef SWAPYV12_BUFFER_H
-#define SWAPYV12_BUFFER_H
+#ifndef VP8_COMMON_SWAPYV12BUFFER_H_
+#define VP8_COMMON_SWAPYV12BUFFER_H_
 
 #include "vpx_scale/yv12config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame);
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_SWAPYV12BUFFER_H_
vp8/common/systemdependent.h:
@@ -8,8 +8,20 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_
+#define VP8_COMMON_SYSTEMDEPENDENT_H_
+
 #include "vpx_config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct VP8Common;
 void vp8_machine_specific_config(struct VP8Common *);
 
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_SYSTEMDEPENDENT_H_
vp8/common/threading.h:
@@ -9,8 +9,12 @@
  */
 
 
-#ifndef _PTHREAD_EMULATION
-#define _PTHREAD_EMULATION
+#ifndef VP8_COMMON_THREADING_H_
+#define VP8_COMMON_THREADING_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
@@ -183,4 +187,8 @@ static inline int sem_destroy(sem_t * sem)
 
 #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_THREADING_H_
vp8/common/treecoder.h:
@@ -9,8 +9,12 @@
  */
 
 
-#ifndef __INC_TREECODER_H
-#define __INC_TREECODER_H
+#ifndef VP8_COMMON_TREECODER_H_
+#define VP8_COMMON_TREECODER_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef unsigned char vp8bc_index_t; /* probability index */
 
@@ -87,4 +91,8 @@ void vp8bc_tree_probs_from_distribution(
 );
 
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_TREECODER_H_
vp8/common/variance.h:
@@ -9,11 +9,15 @@
  */
 
 
-#ifndef VARIANCE_H
-#define VARIANCE_H
+#ifndef VP8_COMMON_VARIANCE_H_
+#define VP8_COMMON_VARIANCE_H_
 
 #include "vpx_config.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef unsigned int(*vp8_sad_fn_t)(
     const unsigned char *src_ptr,
     int source_stride,
@@ -112,4 +116,8 @@ typedef struct variance_vtable
 #endif
 } vp8_variance_fn_ptr_t;
 
+#ifdef __cplusplus
+}  // extern "C"
 #endif
+
+#endif  // VP8_COMMON_VARIANCE_H_
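variance.h's vp8_sad_fn_t, cut off in the hunk above after its first two parameters, points at sum-of-absolute-differences kernels. A plain-C sketch of what such a kernel computes for a 16x16 block; the parameter names beyond src_ptr and source_stride are assumptions, not the declared signature:

#include <stdlib.h>

static unsigned int sad16x16_sketch(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int ref_stride)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
            sad += (unsigned int)abs(src_ptr[c] - ref_ptr[c]);
        src_ptr += source_stride;  /* step one row in each plane */
        ref_ptr += ref_stride;
    }
    return sad;
}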
vp8/common/vp8_entropymodedata.h:
@@ -8,6 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*Generated file, included by entropymode.c*/
 
@@ -240,3 +246,9 @@ const vp8_prob vp8_kf_bmode_prob
         { 112, 19, 12, 61, 195, 128, 48, 4, 24 }
     }
 };
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_VP8_ENTROPYMODEDATA_H_
vp8/common/x86/filter_x86.h:
@@ -8,11 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef FILTER_X86_H
-#define FILTER_X86_H
+#ifndef VP8_COMMON_X86_FILTER_X86_H_
+#define VP8_COMMON_X86_FILTER_X86_H_
 
 #include "vpx_ports/mem.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
  * duplicated values */
 
@@ -22,4 +26,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
 /* duplicated 8x */
 extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
 
-#endif /* FILTER_X86_H */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP8_COMMON_X86_FILTER_X86_H_
Some files were not shown because too many files have changed in this diff.