Bug 1540760 - Add missing aarch64 files for ffvpx; r=jya

Differential Revision: https://phabricator.services.mozilla.com/D27789

--HG--
extra : moz-landing-system : lando
Dan Minor 2019-05-01 23:06:25 +00:00
parent 8bcd86c1fe
commit 23afbbacd1
37 changed files with 12259 additions and 0 deletions


@@ -0,0 +1,50 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/fft.h"
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
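/*
 * Runtime dispatch: the NEON versions are installed only when the CPU
 * flags report NEON. On AArch64, AdvSIMD (NEON) is architecturally
 * mandatory, so the check is effectively always true; it mirrors the
 * 32-bit ARM init code.
 */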
av_cold void ff_fft_init_aarch64(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
s->fft_permute = ff_fft_permute_neon;
s->fft_calc = ff_fft_calc_neon;
#if CONFIG_MDCT
s->imdct_calc = ff_imdct_calc_neon;
s->imdct_half = ff_imdct_half_neon;
s->mdct_calc = ff_mdct_calc_neon;
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
}
}


@@ -0,0 +1,442 @@
/*
* ARM NEON optimised FFT
*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2009 Naotoshi Nojiri
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define M_SQRT1_2 0.70710678118654752440
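// Interleave the even (trn1) and odd (trn2) elements of s0/s1 into d0/d1,
// i.e. a 2x2 transpose at the element size chosen by the caller.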
.macro transpose d0, d1, s0, s1
trn1 \d0, \s0, \s1
trn2 \d1, \s0, \s1
.endm
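// In-place 4-point FFT on interleaved complex floats at x0.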
function fft4_neon
ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
ext v16.8b, v2.8b, v3.8b, #4
ext v17.8b, v3.8b, v2.8b, #4
fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3
fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3
fadd v0.2s, v4.2s, v5.2s
fsub v2.2s, v4.2s, v5.2s
fadd v1.2s, v6.2s, v7.2s
fsub v3.2s, v6.2s, v7.2s
st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
ret
endfunc
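// In-place 8-point FFT; relies on v28 having been loaded from mppm by
// ff_fft_calc_neon.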
function fft8_neon
mov x1, x0
ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
ext v22.8b, v2.8b, v3.8b, #4
ext v23.8b, v3.8b, v2.8b, #4
fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
rev64 v27.2s, v28.2s // v27 = {M_SQRT1_2,-M_SQRT1_2} (v28 = mppm reversed)
fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
ext v6.8b, v4.8b, v5.8b, #4
ext v7.8b, v5.8b, v4.8b, #4
fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
fadd v0.2s, v20.2s, v21.2s
fsub v2.2s, v20.2s, v21.2s
fadd v1.2s, v22.2s, v23.2s
rev64 v26.2s, v26.2s
rev64 v27.2s, v27.2s
fsub v3.2s, v22.2s, v23.2s
fsub v6.2s, v6.2s, v7.2s
fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
fadd v7.2s, v4.2s, v5.2s
fsub v18.2s, v2.2s, v6.2s
ext v26.8b, v24.8b, v25.8b, #4
ext v27.8b, v25.8b, v24.8b, #4
fadd v2.2s, v2.2s, v6.2s
fsub v16.2s, v0.2s, v7.2s
fadd v5.2s, v25.2s, v24.2s
fsub v4.2s, v26.2s, v27.2s
fadd v0.2s, v0.2s, v7.2s
fsub v17.2s, v1.2s, v5.2s
fsub v19.2s, v3.2s, v4.2s
fadd v3.2s, v3.2s, v4.2s
fadd v1.2s, v1.2s, v5.2s
st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]
ret
endfunc
function fft16_neon
mov x1, x0
ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
ext v22.8b, v2.8b, v3.8b, #4
ext v23.8b, v3.8b, v2.8b, #4
fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
rev64 v27.2s, v28.2s // v27 = {M_SQRT1_2,-M_SQRT1_2} (v28 = mppm reversed)
fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
ext v6.8b, v4.8b, v5.8b, #4
ext v7.8b, v5.8b, v4.8b, #4
fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
fadd v0.2s, v20.2s, v21.2s
fsub v2.2s, v20.2s, v21.2s
fadd v1.2s, v22.2s, v23.2s
rev64 v26.2s, v26.2s
rev64 v27.2s, v27.2s
fsub v3.2s, v22.2s, v23.2s
fsub v6.2s, v6.2s, v7.2s
fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
fadd v7.2s, v4.2s, v5.2s
fsub v18.2s, v2.2s, v6.2s
ld1 {v20.4s,v21.4s}, [x0], #32
ld1 {v22.4s,v23.4s}, [x0], #32
ext v26.8b, v24.8b, v25.8b, #4
ext v27.8b, v25.8b, v24.8b, #4
fadd v2.2s, v2.2s, v6.2s
fsub v16.2s, v0.2s, v7.2s
fadd v5.2s, v25.2s, v24.2s
fsub v4.2s, v26.2s, v27.2s
transpose v24.2d, v25.2d, v20.2d, v22.2d
transpose v26.2d, v27.2d, v21.2d, v23.2d
fadd v0.2s, v0.2s, v7.2s
fsub v17.2s, v1.2s, v5.2s
fsub v19.2s, v3.2s, v4.2s
fadd v3.2s, v3.2s, v4.2s
fadd v1.2s, v1.2s, v5.2s
ext v20.16b, v21.16b, v21.16b, #4
ext v21.16b, v23.16b, v23.16b, #4
zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
// 2 x fft4
transpose v22.2d, v23.2d, v20.2d, v21.2d
fadd v4.4s, v24.4s, v25.4s
fadd v5.4s, v26.4s, v27.4s
fsub v6.4s, v24.4s, v25.4s
fsub v7.4s, v22.4s, v23.4s
ld1 {v23.4s}, [x14]
fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
//fft_pass_neon_16
rev64 v7.4s, v25.4s
fmul v25.4s, v25.4s, v23.s[1]
fmul v7.4s, v7.4s, v29.4s
fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
zip1 v20.4s, v24.4s, v25.4s
zip2 v21.4s, v24.4s, v25.4s
fneg v22.4s, v20.4s
fadd v4.4s, v21.4s, v20.4s
fsub v6.4s, v20.4s, v21.4s // just the second half
fadd v5.4s, v21.4s, v22.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
//second half
rev64 v6.4s, v26.4s
fmul v26.4s, v26.4s, v23.s[2]
rev64 v7.4s, v27.4s
fmul v27.4s, v27.4s, v23.s[3]
fmul v6.4s, v6.4s, v29.4s
fmul v7.4s, v7.4s, v29.4s
fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
zip1 v24.4s, v26.4s, v27.4s
zip2 v25.4s, v26.4s, v27.4s
fneg v26.4s, v24.4s
fadd v4.4s, v25.4s, v24.4s
fsub v6.4s, v24.4s, v25.4s // just the second half
fadd v5.4s, v25.4s, v26.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}
st1 {v16.4s,v17.4s}, [x1], #32
st1 {v18.4s,v19.4s}, [x1], #32
st1 {v20.4s,v21.4s}, [x1], #32
st1 {v22.4s,v23.4s}, [x1], #32
ret
endfunc
const trans4_float, align=4
.byte 0, 1, 2, 3
.byte 8, 9, 10, 11
.byte 4, 5, 6, 7
.byte 12, 13, 14, 15
endconst
const trans8_float, align=4
.byte 24, 25, 26, 27
.byte 0, 1, 2, 3
.byte 28, 29, 30, 31
.byte 4, 5, 6, 7
endconst
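// One split-radix combine pass: x0 = z, x2 = n, x4 = wre twiddle table
// (wim is read backwards from the end of the same table). Merges the
// quarter arrays at z, z+2n, z+4n and z+6n, two complex values per
// quarter per iteration.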
function fft_pass_neon
sub x6, x2, #1 // n - 1, loop counter
lsl x5, x2, #3 // 2 * n * sizeof FFTSample
lsl x1, x2, #4 // 2 * n * sizeof FFTComplex
add x5, x4, x5 // wim
add x3, x1, x2, lsl #5 // 4 * n * sizeof FFTComplex
add x2, x0, x2, lsl #5 // &z[o2]
add x3, x0, x3 // &z[o3]
add x1, x0, x1 // &z[o1]
ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
trn2 v25.2d, v20.2d, v22.2d
sub x5, x5, #4 // wim--
trn1 v24.2d, v20.2d, v22.2d
ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1]
rev64 v7.4s, v25.4s
fmul v25.4s, v25.4s, v4.s[1]
ld1 {v16.4s}, [x0] // {z[0],z[1]}
fmul v7.4s, v7.4s, v29.4s
ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
prfm pldl1keep, [x2, #16]
prfm pldl1keep, [x3, #16]
fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
prfm pldl1keep, [x0, #16]
prfm pldl1keep, [x1, #16]
zip1 v20.4s, v24.4s, v25.4s
zip2 v21.4s, v24.4s, v25.4s
fneg v22.4s, v20.4s
fadd v4.4s, v21.4s, v20.4s
fsub v6.4s, v20.4s, v21.4s // just the second half
fadd v5.4s, v21.4s, v22.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fadd v20.4s, v16.4s, v4.4s
fsub v22.4s, v16.4s, v4.4s
fadd v21.4s, v17.4s, v5.4s
st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
fsub v23.4s, v17.4s, v5.4s
st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
1:
ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
transpose v26.2d, v27.2d, v20.2d, v22.2d
ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
rev64 v6.4s, v26.4s
fmul v26.4s, v26.4s, v4.s[0]
rev64 v7.4s, v27.4s
fmul v27.4s, v27.4s, v4.s[1]
fmul v6.4s, v6.4s, v29.4s
fmul v7.4s, v7.4s, v29.4s
ld1 {v16.4s},[x0] // {z[0],z[1]}
fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
subs x6, x6, #1 // n--
zip1 v20.4s, v26.4s, v27.4s
zip2 v21.4s, v26.4s, v27.4s
fneg v22.4s, v20.4s
fadd v4.4s, v21.4s, v20.4s
fsub v6.4s, v20.4s, v21.4s // just the second half
fadd v5.4s, v21.4s, v22.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fadd v20.4s, v16.4s, v4.4s
fsub v22.4s, v16.4s, v4.4s
fadd v21.4s, v17.4s, v5.4s
st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
fsub v23.4s, v17.4s, v5.4s
st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
b.ne 1b
ret
endfunc
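// Split-radix recursion: an N-point FFT is built from one N/2-point FFT on
// the first half and two N/4-point FFTs on the second half, merged by a
// final fft_pass_neon over the combined output.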
.macro def_fft n, n2, n4
function fft\n\()_neon, align=6
sub sp, sp, #16
stp x28, x30, [sp]
add x28, x0, #\n4*2*8
bl fft\n2\()_neon
mov x0, x28
bl fft\n4\()_neon
add x0, x28, #\n4*1*8
bl fft\n4\()_neon
sub x0, x28, #\n4*2*8
ldp x28, x30, [sp], #16
movrel x4, X(ff_cos_\n)
mov x2, #\n4>>1
b fft_pass_neon
endfunc
.endm
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
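// Entry point: indexes fft_tab_neon by nbits-2 and tail-calls the matching
// fftN_neon, with the shuffle tables (trans4/trans8), pmmp/mppm constants
// and ff_cos_16 preloaded into v28-v31 and x14.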
function ff_fft_calc_neon, export=1
prfm pldl1keep, [x1]
movrel x10, trans4_float
ldr w2, [x0]
movrel x11, trans8_float
sub w2, w2, #2
movrel x3, fft_tab_neon
ld1 {v30.16b}, [x10]
mov x7, #-8
movrel x12, pmmp
ldr x3, [x3, x2, lsl #3]
movrel x13, mppm
movrel x14, X(ff_cos_16)
ld1 {v31.16b}, [x11]
mov x0, x1
ld1 {v29.4s}, [x12] // pmmp
ld1 {v28.4s}, [x13]
br x3
endfunc
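// Scatter z through revtab (two 16-bit indices are packed into each 32-bit
// entry) into tmp_buf, then copy the reordered buffer back over z.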
function ff_fft_permute_neon, export=1
mov x6, #1
ldr w2, [x0] // nbits
ldr x3, [x0, #16] // tmp_buf
ldr x0, [x0, #8] // revtab
lsl x6, x6, x2
mov x2, x6
1:
ld1 {v0.2s,v1.2s}, [x1], #16
ldr w4, [x0], #4
uxth w5, w4
lsr w4, w4, #16
add x5, x3, x5, lsl #3
add x4, x3, x4, lsl #3
st1 {v0.2s}, [x5]
st1 {v1.2s}, [x4]
subs x6, x6, #2
b.gt 1b
sub x1, x1, x2, lsl #3
1:
ld1 {v0.4s,v1.4s}, [x3], #32
st1 {v0.4s,v1.4s}, [x1], #32
subs x2, x2, #4
b.gt 1b
ret
endfunc
const fft_tab_neon, relocate=1
.quad fft4_neon
.quad fft8_neon
.quad fft16_neon
.quad fft32_neon
.quad fft64_neon
.quad fft128_neon
.quad fft256_neon
.quad fft512_neon
.quad fft1024_neon
.quad fft2048_neon
.quad fft4096_neon
.quad fft8192_neon
.quad fft16384_neon
.quad fft32768_neon
.quad fft65536_neon
endconst
const pmmp, align=4
.float +1.0, -1.0, -1.0, +1.0
endconst
const mppm, align=4
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst


@@ -0,0 +1,59 @@
/*
* ARM NEON optimised H.264 chroma functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264chroma.h"
#include "config.h"
void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
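/*
 * These implement the H.264 bilinear chroma interpolation
 *   dst = ((8-x)(8-y)*A + x(8-y)*B + (8-x)y*C + xy*D + 32) >> 6
 * where A..D are the four source pixels around the fractional position.
 */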
av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
{
const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
}
}


@@ -0,0 +1,450 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
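// The scalar weights computed below are A = (8-x)(8-y), B = x(8-y),
// C = (8-x)y and D = xy; the main loop evaluates
//   (A*a + B*b + C*c + D*d + 32) >> 6
// for h264, or adds a codec-specific bias before a truncating shift for
// rv40/vc1. The x == 0 / y == 0 cases branch to simplified loops.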
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
.ifc \type,avg
mov x8, x0
.endif
prfm pldl1strm, [x1]
prfm pldl1strm, [x1, x2]
.ifc \codec,rv40
movrel x6, rv40bias
lsr w9, w5, #1
lsr w10, w4, #1
lsl w9, w9, #3
lsl w10, w10, #1
add w9, w9, w10
add x6, x6, w9, UXTW
ld1r {v22.8H}, [x6]
.endif
.ifc \codec,vc1
movi v22.8H, #28
.endif
mul w7, w4, w5
lsl w14, w5, #3
lsl w13, w4, #3
cmp w7, #0
sub w6, w14, w7
sub w12, w13, w7
sub w4, w7, w13
sub w4, w4, w14
add w4, w4, #64
b.eq 2f
dup v0.8B, w4
dup v1.8B, w12
ld1 {v4.8B, v5.8B}, [x1], x2
dup v2.8B, w6
dup v3.8B, w7
ext v5.8B, v4.8B, v5.8B, #1
1: ld1 {v6.8B, v7.8B}, [x1], x2
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v5.8B, v1.8B
ext v7.8B, v6.8B, v7.8B, #1
ld1 {v4.8B, v5.8B}, [x1], x2
umlal v16.8H, v6.8B, v2.8B
prfm pldl1strm, [x1]
ext v5.8B, v4.8B, v5.8B, #1
umlal v16.8H, v7.8B, v3.8B
umull v17.8H, v6.8B, v0.8B
subs w3, w3, #2
umlal v17.8H, v7.8B, v1.8B
umlal v17.8H, v4.8B, v2.8B
umlal v17.8H, v5.8B, v3.8B
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
b.gt 1b
ret
2: adds w12, w12, w6
dup v0.8B, w4
b.eq 5f
tst w6, w6
dup v1.8B, w12
b.eq 4f
ld1 {v4.8B}, [x1], x2
3: ld1 {v6.8B}, [x1], x2
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v6.8B, v1.8B
ld1 {v4.8B}, [x1], x2
umull v17.8H, v6.8B, v0.8B
umlal v17.8H, v4.8B, v1.8B
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
.endif
prfm pldl1strm, [x1, x2]
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
.endif
subs w3, w3, #2
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
b.gt 3b
ret
4: ld1 {v4.8B, v5.8B}, [x1], x2
ld1 {v6.8B, v7.8B}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1
ext v7.8B, v6.8B, v7.8B, #1
prfm pldl1strm, [x1]
subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v5.8B, v1.8B
umull v17.8H, v6.8B, v0.8B
umlal v17.8H, v7.8B, v1.8B
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
b.gt 4b
ret
5: ld1 {v4.8B}, [x1], x2
ld1 {v5.8B}, [x1], x2
prfm pldl1strm, [x1]
subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B
umull v17.8H, v5.8B, v0.8B
prfm pldl1strm, [x1, x2]
.ifc \codec,h264
rshrn v16.8B, v16.8H, #6
rshrn v17.8B, v17.8H, #6
.else
add v16.8H, v16.8H, v22.8H
add v17.8H, v17.8H, v22.8H
shrn v16.8B, v16.8H, #6
shrn v17.8B, v17.8H, #6
.endif
.ifc \type,avg
ld1 {v20.8B}, [x8], x2
ld1 {v21.8B}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B
urhadd v17.8B, v17.8B, v21.8B
.endif
st1 {v16.8B}, [x0], x2
st1 {v17.8B}, [x0], x2
b.gt 5b
ret
endfunc
.endm
/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
.ifc \type,avg
mov x8, x0
.endif
prfm pldl1strm, [x1]
prfm pldl1strm, [x1, x2]
.ifc \codec,rv40
movrel x6, rv40bias
lsr w9, w5, #1
lsr w10, w4, #1
lsl w9, w9, #3
lsl w10, w10, #1
add w9, w9, w10
add x6, x6, w9, UXTW
ld1r {v22.8H}, [x6]
.endif
.ifc \codec,vc1
movi v22.8H, #28
.endif
mul w7, w4, w5
lsl w14, w5, #3
lsl w13, w4, #3
cmp w7, #0
sub w6, w14, w7
sub w12, w13, w7
sub w4, w7, w13
sub w4, w4, w14
add w4, w4, #64
b.eq 2f
dup v24.8B, w4
dup v25.8B, w12
ld1 {v4.8B}, [x1], x2
dup v26.8B, w6
dup v27.8B, w7
ext v5.8B, v4.8B, v5.8B, #1
trn1 v0.2S, v24.2S, v25.2S
trn1 v2.2S, v26.2S, v27.2S
trn1 v4.2S, v4.2S, v5.2S
1: ld1 {v6.8B}, [x1], x2
ext v7.8B, v6.8B, v7.8B, #1
trn1 v6.2S, v6.2S, v7.2S
umull v18.8H, v4.8B, v0.8B
umlal v18.8H, v6.8B, v2.8B
ld1 {v4.8B}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1
trn1 v4.2S, v4.2S, v5.2S
prfm pldl1strm, [x1]
umull v19.8H, v6.8B, v0.8B
umlal v19.8H, v4.8B, v2.8B
trn1 v30.2D, v18.2D, v19.2D
trn2 v31.2D, v18.2D, v19.2D
add v18.8H, v30.8H, v31.8H
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
.endif
subs w3, w3, #2
prfm pldl1strm, [x1, x2]
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
.endif
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
b.gt 1b
ret
2: adds w12, w12, w6
dup v30.8B, w4
b.eq 5f
tst w6, w6
dup v31.8B, w12
trn1 v0.2S, v30.2S, v31.2S
trn2 v1.2S, v30.2S, v31.2S
b.eq 4f
ext v1.8B, v0.8B, v1.8B, #4
ld1 {v4.S}[0], [x1], x2
3: ld1 {v4.S}[1], [x1], x2
umull v18.8H, v4.8B, v0.8B
ld1 {v4.S}[0], [x1], x2
umull v19.8H, v4.8B, v1.8B
trn1 v30.2D, v18.2D, v19.2D
trn2 v31.2D, v18.2D, v19.2D
add v18.8H, v30.8H, v31.8H
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
.endif
subs w3, w3, #2
prfm pldl1strm, [x1, x2]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
b.gt 3b
ret
4: ld1 {v4.8B}, [x1], x2
ld1 {v6.8B}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1
ext v7.8B, v6.8B, v7.8B, #1
trn1 v4.2S, v4.2S, v5.2S
trn1 v6.2S, v6.2S, v7.2S
umull v18.8H, v4.8B, v0.8B
umull v19.8H, v6.8B, v0.8B
subs w3, w3, #2
trn1 v30.2D, v18.2D, v19.2D
trn2 v31.2D, v18.2D, v19.2D
add v18.8H, v30.8H, v31.8H
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
.endif
prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
b.gt 4b
ret
5: ld1 {v4.S}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2
umull v18.8H, v4.8B, v30.8B
subs w3, w3, #2
prfm pldl1strm, [x1]
.ifc \codec,h264
rshrn v16.8B, v18.8H, #6
.else
add v18.8H, v18.8H, v22.8H
shrn v16.8B, v18.8H, #6
.endif
.ifc \type,avg
ld1 {v20.S}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B
.endif
prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2
st1 {v16.S}[1], [x0], x2
b.gt 5b
ret
endfunc
.endm
.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
prfm pldl1strm, [x1]
prfm pldl1strm, [x1, x2]
orr w7, w4, w5
cbz w7, 2f
mul w7, w4, w5
lsl w14, w5, #3
lsl w13, w4, #3
sub w6, w14, w7
sub w12, w13, w7
sub w4, w7, w13
sub w4, w4, w14
add w4, w4, #64
dup v0.8B, w4
dup v2.8B, w12
dup v1.8B, w6
dup v3.8B, w7
trn1 v0.4H, v0.4H, v2.4H
trn1 v1.4H, v1.4H, v3.4H
1:
ld1 {v4.S}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2
rev64 v5.2S, v4.2S
ld1 {v5.S}[1], [x1]
ext v6.8B, v4.8B, v5.8B, #1
ext v7.8B, v5.8B, v4.8B, #1
trn1 v4.4H, v4.4H, v6.4H
trn1 v5.4H, v5.4H, v7.4H
umull v16.8H, v4.8B, v0.8B
umlal v16.8H, v5.8B, v1.8B
.ifc \type,avg
ld1 {v18.H}[0], [x0], x2
ld1 {v18.H}[2], [x0]
sub x0, x0, x2
.endif
rev64 v17.4S, v16.4S
add v16.8H, v16.8H, v17.8H
rshrn v16.8B, v16.8H, #6
.ifc \type,avg
urhadd v16.8B, v16.8B, v18.8B
.endif
st1 {v16.H}[0], [x0], x2
st1 {v16.H}[2], [x0], x2
subs w3, w3, #2
b.gt 1b
ret
2:
ld1 {v16.H}[0], [x1], x2
ld1 {v16.H}[1], [x1], x2
.ifc \type,avg
ld1 {v18.H}[0], [x0], x2
ld1 {v18.H}[1], [x0]
sub x0, x0, x2
urhadd v16.8B, v16.8B, v18.8B
.endif
st1 {v16.H}[0], [x0], x2
st1 {v16.H}[1], [x0], x2
subs w3, w3, #2
b.gt 2b
ret
endfunc
.endm
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg
#if CONFIG_RV40_DECODER
const rv40bias
.short 0, 16, 32, 16
.short 32, 28, 32, 28
.short 0, 32, 16, 32
.short 32, 28, 32, 28
endconst
h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif
#if CONFIG_VC1DSP
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
h264_chroma_mc4 avg, vc1
#endif


@@ -0,0 +1,102 @@
/*
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264dsp.h"
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
int16_t *block, int stride,
const uint8_t nnzc[6*8]);
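/* All of the NEON routines above assume 8-bit samples; other bit depths
 * keep the C implementations. */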
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags) && bit_depth == 8) {
c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
if (chroma_format_idc <= 1)
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
c->h264_idct_add16 = ff_h264_idct_add16_neon;
c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
if (chroma_format_idc <= 1)
c->h264_idct_add8 = ff_h264_idct_add8_neon;
c->h264_idct8_add = ff_h264_idct8_add_neon;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
}
}


@@ -0,0 +1,498 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
.macro h264_loop_filter_start
cmp w2, #0
ldr w6, [x4]
ccmp w3, #0, #0, ne
mov v24.S[0], w6
and w6, w6, w6, lsl #16
b.eq 1f
ands w6, w6, w6, lsl #8
b.ge 2f
1:
ret
2:
.endm
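// Luma filter on 16 pixels across the edge: expects p2,p1,p0,q0,q1,q2 in
// v20,v18,v16,v0,v2,v4 and leaves the filtered p1,p0,q0,q1 in
// v17,v16,v0,v19.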
.macro h264_loop_filter_luma
dup v22.16B, w2 // alpha
uxtl v24.8H, v24.8B
uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0)
uxtl v24.4S, v24.4H
uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
sli v24.8H, v24.8H, #8
uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0)
sli v24.4S, v24.4S, #16
cmhi v21.16B, v22.16B, v21.16B // < alpha
dup v22.16B, w3 // beta
cmlt v23.16B, v24.16B, #0
cmhi v28.16B, v22.16B, v28.16B // < beta
cmhi v30.16B, v22.16B, v30.16B // < beta
bic v21.16B, v21.16B, v23.16B
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0)
cmhi v17.16B, v22.16B, v17.16B // < beta
and v21.16B, v21.16B, v30.16B
cmhi v19.16B, v22.16B, v19.16B // < beta
and v17.16B, v17.16B, v21.16B
and v19.16B, v19.16B, v21.16B
and v24.16B, v24.16B, v21.16B
urhadd v28.16B, v16.16B, v0.16B
sub v21.16B, v24.16B, v17.16B
uqadd v23.16B, v18.16B, v24.16B
uhadd v20.16B, v20.16B, v28.16B
sub v21.16B, v21.16B, v19.16B
uhadd v28.16B, v4.16B, v28.16B
umin v23.16B, v23.16B, v20.16B
uqsub v22.16B, v18.16B, v24.16B
uqadd v4.16B, v2.16B, v24.16B
umax v23.16B, v23.16B, v22.16B
uqsub v22.16B, v2.16B, v24.16B
umin v28.16B, v4.16B, v28.16B
uxtl v4.8H, v0.8B
umax v28.16B, v28.16B, v22.16B
uxtl2 v20.8H, v0.16B
usubw v4.8H, v4.8H, v16.8B
usubw2 v20.8H, v20.8H, v16.16B
shl v4.8H, v4.8H, #2
shl v20.8H, v20.8H, #2
uaddw v4.8H, v4.8H, v18.8B
uaddw2 v20.8H, v20.8H, v18.16B
usubw v4.8H, v4.8H, v2.8B
usubw2 v20.8H, v20.8H, v2.16B
rshrn v4.8B, v4.8H, #3
rshrn2 v4.16B, v20.8H, #3
bsl v17.16B, v23.16B, v18.16B
bsl v19.16B, v28.16B, v2.16B
neg v23.16B, v21.16B
uxtl v28.8H, v16.8B
smin v4.16B, v4.16B, v21.16B
uxtl2 v21.8H, v16.16B
smax v4.16B, v4.16B, v23.16B
uxtl v22.8H, v0.8B
uxtl2 v24.8H, v0.16B
saddw v28.8H, v28.8H, v4.8B
saddw2 v21.8H, v21.8H, v4.16B
ssubw v22.8H, v22.8H, v4.8B
ssubw2 v24.8H, v24.8H, v4.16B
sqxtun v16.8B, v28.8H
sqxtun2 v16.16B, v21.8H
sqxtun v0.8B, v22.8H
sqxtun2 v0.16B, v24.8H
.endm
function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start
sxtw x1, w1
ld1 {v0.16B}, [x0], x1
ld1 {v2.16B}, [x0], x1
ld1 {v4.16B}, [x0], x1
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
ld1 {v20.16B}, [x0], x1
ld1 {v18.16B}, [x0], x1
ld1 {v16.16B}, [x0], x1
h264_loop_filter_luma
sub x0, x0, x1, lsl #1
st1 {v17.16B}, [x0], x1
st1 {v16.16B}, [x0], x1
st1 {v0.16B}, [x0], x1
st1 {v19.16B}, [x0]
ret
endfunc
function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start
sub x0, x0, #4
ld1 {v6.8B}, [x0], x1
ld1 {v20.8B}, [x0], x1
ld1 {v18.8B}, [x0], x1
ld1 {v16.8B}, [x0], x1
ld1 {v0.8B}, [x0], x1
ld1 {v2.8B}, [x0], x1
ld1 {v4.8B}, [x0], x1
ld1 {v26.8B}, [x0], x1
ld1 {v6.D}[1], [x0], x1
ld1 {v20.D}[1], [x0], x1
ld1 {v18.D}[1], [x0], x1
ld1 {v16.D}[1], [x0], x1
ld1 {v0.D}[1], [x0], x1
ld1 {v2.D}[1], [x0], x1
ld1 {v4.D}[1], [x0], x1
ld1 {v26.D}[1], [x0], x1
transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
h264_loop_filter_luma
transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
sub x0, x0, x1, lsl #4
add x0, x0, #2
st1 {v17.S}[0], [x0], x1
st1 {v16.S}[0], [x0], x1
st1 {v0.S}[0], [x0], x1
st1 {v19.S}[0], [x0], x1
st1 {v17.S}[1], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v19.S}[1], [x0], x1
st1 {v17.S}[2], [x0], x1
st1 {v16.S}[2], [x0], x1
st1 {v0.S}[2], [x0], x1
st1 {v19.S}[2], [x0], x1
st1 {v17.S}[3], [x0], x1
st1 {v16.S}[3], [x0], x1
st1 {v0.S}[3], [x0], x1
st1 {v19.S}[3], [x0], x1
ret
endfunc
.macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha
uxtl v24.8H, v24.8B
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0)
uxtl v4.8H, v0.8B
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0)
usubw v4.8H, v4.8H, v16.8B
sli v24.8H, v24.8H, #8
shl v4.8H, v4.8H, #2
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0)
uaddw v4.8H, v4.8H, v18.8B
cmhi v26.8B, v22.8B, v26.8B // < alpha
usubw v4.8H, v4.8H, v2.8B
dup v22.8B, w3 // beta
rshrn v4.8B, v4.8H, #3
cmhi v28.8B, v22.8B, v28.8B // < beta
cmhi v30.8B, v22.8B, v30.8B // < beta
smin v4.8B, v4.8B, v24.8B
neg v25.8B, v24.8B
and v26.8B, v26.8B, v28.8B
smax v4.8B, v4.8B, v25.8B
and v26.8B, v26.8B, v30.8B
uxtl v22.8H, v0.8B
and v4.8B, v4.8B, v26.8B
uxtl v28.8H, v16.8B
saddw v28.8H, v28.8H, v4.8B
ssubw v22.8H, v22.8H, v4.8B
sqxtun v16.8B, v28.8H
sqxtun v0.8B, v22.8H
.endm
function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, x1, lsl #1
ld1 {v18.8B}, [x0], x1
ld1 {v16.8B}, [x0], x1
ld1 {v0.8B}, [x0], x1
ld1 {v2.8B}, [x0]
h264_loop_filter_chroma
sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1
st1 {v0.8B}, [x0], x1
ret
endfunc
function ff_h264_h_loop_filter_chroma_neon, export=1
h264_loop_filter_start
sub x0, x0, #2
ld1 {v18.S}[0], [x0], x1
ld1 {v16.S}[0], [x0], x1
ld1 {v0.S}[0], [x0], x1
ld1 {v2.S}[0], [x0], x1
ld1 {v18.S}[1], [x0], x1
ld1 {v16.S}[1], [x0], x1
ld1 {v0.S}[1], [x0], x1
ld1 {v2.S}[1], [x0], x1
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
h264_loop_filter_chroma
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
sub x0, x0, x1, lsl #3
st1 {v18.S}[0], [x0], x1
st1 {v16.S}[0], [x0], x1
st1 {v0.S}[0], [x0], x1
st1 {v2.S}[0], [x0], x1
st1 {v18.S}[1], [x0], x1
st1 {v16.S}[1], [x0], x1
st1 {v0.S}[1], [x0], x1
st1 {v2.S}[1], [x0], x1
ret
endfunc
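// Bi-directional weighted prediction:
//   out = sat((dst*wd + src*ws + (((offset+1)|1) << log2_den)) >> (log2_den+1))
// The entry code below selects one of four umlal/umlsl bodies according to
// the signs of the two weights.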
.macro biweight_16 macs, macd
dup v0.16B, w5
dup v1.16B, w6
mov v4.16B, v16.16B
mov v6.16B, v16.16B
1: subs w3, w3, #2
ld1 {v20.16B}, [x0], x2
\macd v4.8H, v0.8B, v20.8B
\macd\()2 v6.8H, v0.16B, v20.16B
ld1 {v22.16B}, [x1], x2
\macs v4.8H, v1.8B, v22.8B
\macs\()2 v6.8H, v1.16B, v22.16B
mov v24.16B, v16.16B
ld1 {v28.16B}, [x0], x2
mov v26.16B, v16.16B
\macd v24.8H, v0.8B, v28.8B
\macd\()2 v26.8H, v0.16B, v28.16B
ld1 {v30.16B}, [x1], x2
\macs v24.8H, v1.8B, v30.8B
\macs\()2 v26.8H, v1.16B, v30.16B
sshl v4.8H, v4.8H, v18.8H
sshl v6.8H, v6.8H, v18.8H
sqxtun v4.8B, v4.8H
sqxtun2 v4.16B, v6.8H
sshl v24.8H, v24.8H, v18.8H
sshl v26.8H, v26.8H, v18.8H
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v26.8H
mov v6.16B, v16.16B
st1 {v4.16B}, [x7], x2
mov v4.16B, v16.16B
st1 {v24.16B}, [x7], x2
b.ne 1b
ret
.endm
.macro biweight_8 macs, macd
dup v0.8B, w5
dup v1.8B, w6
mov v2.16B, v16.16B
mov v20.16B, v16.16B
1: subs w3, w3, #2
ld1 {v4.8B}, [x0], x2
\macd v2.8H, v0.8B, v4.8B
ld1 {v5.8B}, [x1], x2
\macs v2.8H, v1.8B, v5.8B
ld1 {v6.8B}, [x0], x2
\macd v20.8H, v0.8B, v6.8B
ld1 {v7.8B}, [x1], x2
\macs v20.8H, v1.8B, v7.8B
sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
sshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
mov v20.16B, v16.16B
st1 {v2.8B}, [x7], x2
mov v2.16B, v16.16B
st1 {v4.8B}, [x7], x2
b.ne 1b
ret
.endm
.macro biweight_4 macs, macd
dup v0.8B, w5
dup v1.8B, w6
mov v2.16B, v16.16B
mov v20.16B, v16.16B
1: subs w3, w3, #4
ld1 {v4.S}[0], [x0], x2
ld1 {v4.S}[1], [x0], x2
\macd v2.8H, v0.8B, v4.8B
ld1 {v5.S}[0], [x1], x2
ld1 {v5.S}[1], [x1], x2
\macs v2.8H, v1.8B, v5.8B
b.lt 2f
ld1 {v6.S}[0], [x0], x2
ld1 {v6.S}[1], [x0], x2
\macd v20.8H, v0.8B, v6.8B
ld1 {v7.S}[0], [x1], x2
ld1 {v7.S}[1], [x1], x2
\macs v20.8H, v1.8B, v7.8B
sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
sshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
mov v20.16B, v16.16B
st1 {v2.S}[0], [x7], x2
st1 {v2.S}[1], [x7], x2
mov v2.16B, v16.16B
st1 {v4.S}[0], [x7], x2
st1 {v4.S}[1], [x7], x2
b.ne 1b
ret
2: sshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
st1 {v2.S}[0], [x7], x2
st1 {v2.S}[1], [x7], x2
ret
.endm
.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
sxtw x2, w2
lsr w8, w5, #31
add w7, w7, #1
eor w8, w8, w6, lsr #30
orr w7, w7, #1
dup v18.8H, w4
lsl w7, w7, w4
not v18.16B, v18.16B
dup v16.8H, w7
mov x7, x0
cbz w8, 10f
subs w8, w8, #1
b.eq 20f
subs w8, w8, #1
b.eq 30f
b 40f
10: biweight_\w umlal, umlal
20: neg w5, w5
biweight_\w umlal, umlsl
30: neg w5, w5
neg w6, w6
biweight_\w umlsl, umlsl
40: neg w6, w6
biweight_\w umlsl, umlal
endfunc
.endm
biweight_func 16
biweight_func 8
biweight_func 4
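// Unidirectional weighted prediction, roughly
//   out = sat((pix*w + (offset << log2_den) + rounding) >> log2_den)
// with halving adds used for log2_den > 1 so the sum stays within 16 bits,
// and add/sub bodies selected by the sign of the weight.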
.macro weight_16 add
dup v0.16B, w4
1: subs w2, w2, #2
ld1 {v20.16B}, [x0], x1
umull v4.8H, v0.8B, v20.8B
umull2 v6.8H, v0.16B, v20.16B
ld1 {v28.16B}, [x0], x1
umull v24.8H, v0.8B, v28.8B
umull2 v26.8H, v0.16B, v28.16B
\add v4.8H, v16.8H, v4.8H
srshl v4.8H, v4.8H, v18.8H
\add v6.8H, v16.8H, v6.8H
srshl v6.8H, v6.8H, v18.8H
sqxtun v4.8B, v4.8H
sqxtun2 v4.16B, v6.8H
\add v24.8H, v16.8H, v24.8H
srshl v24.8H, v24.8H, v18.8H
\add v26.8H, v16.8H, v26.8H
srshl v26.8H, v26.8H, v18.8H
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v26.8H
st1 {v4.16B}, [x5], x1
st1 {v24.16B}, [x5], x1
b.ne 1b
ret
.endm
.macro weight_8 add
dup v0.8B, w4
1: subs w2, w2, #2
ld1 {v4.8B}, [x0], x1
umull v2.8H, v0.8B, v4.8B
ld1 {v6.8B}, [x0], x1
umull v20.8H, v0.8B, v6.8B
\add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
\add v20.8H, v16.8H, v20.8H
srshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
st1 {v2.8B}, [x5], x1
st1 {v4.8B}, [x5], x1
b.ne 1b
ret
.endm
.macro weight_4 add
dup v0.8B, w4
1: subs w2, w2, #4
ld1 {v4.S}[0], [x0], x1
ld1 {v4.S}[1], [x0], x1
umull v2.8H, v0.8B, v4.8B
b.lt 2f
ld1 {v6.S}[0], [x0], x1
ld1 {v6.S}[1], [x0], x1
umull v20.8H, v0.8B, v6.8B
\add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
\add v20.8H, v16.8H, v20.8H
srshl v20.8H, v20.8H, v18.8H
sqxtun v4.8B, v20.8H
st1 {v2.S}[0], [x5], x1
st1 {v2.S}[1], [x5], x1
st1 {v4.S}[0], [x5], x1
st1 {v4.S}[1], [x5], x1
b.ne 1b
ret
2: \add v2.8H, v16.8H, v2.8H
srshl v2.8H, v2.8H, v18.8H
sqxtun v2.8B, v2.8H
st1 {v2.S}[0], [x5], x1
st1 {v2.S}[1], [x5], x1
ret
.endm
.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
sxtw x1, w1
cmp w3, #1
mov w6, #1
lsl w5, w5, w3
dup v16.8H, w5
mov x5, x0
b.le 20f
sub w6, w6, w3
dup v18.8H, w6
cmp w4, #0
b.lt 10f
weight_\w shadd
10: neg w4, w4
weight_\w shsub
20: neg w6, w3
dup v18.8H, w6
cmp w4, #0
b.lt 10f
weight_\w add
10: neg w4, w4
weight_\w sub
endfunc
.endm
weight_func 16
weight_func 8
weight_func 4


@@ -0,0 +1,409 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
function ff_h264_idct_add_neon, export=1
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
sxtw x2, w2
movi v30.8H, #0
add v4.4H, v0.4H, v2.4H
sshr v16.4H, v1.4H, #1
st1 {v30.8H}, [x1], #16
sshr v17.4H, v3.4H, #1
st1 {v30.8H}, [x1], #16
sub v5.4H, v0.4H, v2.4H
sub v6.4H, v16.4H, v3.4H
add v7.4H, v1.4H, v17.4H
add v0.4H, v4.4H, v7.4H
add v1.4H, v5.4H, v6.4H
sub v2.4H, v5.4H, v6.4H
sub v3.4H, v4.4H, v7.4H
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
add v4.4H, v0.4H, v2.4H
ld1 {v18.S}[0], [x0], x2
sshr v16.4H, v3.4H, #1
sshr v17.4H, v1.4H, #1
ld1 {v18.S}[1], [x0], x2
sub v5.4H, v0.4H, v2.4H
ld1 {v19.S}[1], [x0], x2
add v6.4H, v16.4H, v1.4H
ins v4.D[1], v5.D[0]
sub v7.4H, v17.4H, v3.4H
ld1 {v19.S}[0], [x0], x2
ins v6.D[1], v7.D[0]
sub x0, x0, x2, lsl #2
add v0.8H, v4.8H, v6.8H
sub v1.8H, v4.8H, v6.8H
srshr v0.8H, v0.8H, #6
srshr v1.8H, v1.8H, #6
uaddw v0.8H, v0.8H, v18.8B
uaddw v1.8H, v1.8H, v19.8B
sqxtun v0.8B, v0.8H
sqxtun v1.8B, v1.8H
st1 {v0.S}[0], [x0], x2
st1 {v0.S}[1], [x0], x2
st1 {v1.S}[1], [x0], x2
st1 {v1.S}[0], [x0], x2
sub x1, x1, #32
ret
endfunc
function ff_h264_idct_dc_add_neon, export=1
sxtw x2, w2
mov w3, #0
ld1r {v2.8H}, [x1]
strh w3, [x1]
srshr v2.8H, v2.8H, #6
ld1 {v0.S}[0], [x0], x2
ld1 {v0.S}[1], [x0], x2
uaddw v3.8H, v2.8H, v0.8B
ld1 {v1.S}[0], [x0], x2
ld1 {v1.S}[1], [x0], x2
uaddw v4.8H, v2.8H, v1.8B
sqxtun v0.8B, v3.8H
sqxtun v1.8B, v4.8H
sub x0, x0, x2, lsl #2
st1 {v0.S}[0], [x0], x2
st1 {v0.S}[1], [x0], x2
st1 {v1.S}[0], [x0], x2
st1 {v1.S}[1], [x0], x2
ret
endfunc
function ff_h264_idct_add16_neon, export=1
mov x12, x30
mov x6, x0 // dest
mov x5, x1 // block_offset
mov x1, x2 // block
mov w9, w3 // stride
movrel x7, scan8
mov x10, #16
movrel x13, X(ff_h264_idct_dc_add_neon)
movrel x14, X(ff_h264_idct_add_neon)
1: mov w2, w9
ldrb w3, [x7], #1
ldrsw x0, [x5], #4
ldrb w3, [x4, w3, uxtw]
subs w3, w3, #1
b.lt 2f
ldrsh w3, [x1]
add x0, x0, x6
ccmp w3, #0, #4, eq
csel x15, x13, x14, ne
blr x15
2: subs x10, x10, #1
add x1, x1, #32
b.ne 1b
ret x12
endfunc
function ff_h264_idct_add16intra_neon, export=1
mov x12, x30
mov x6, x0 // dest
mov x5, x1 // block_offset
mov x1, x2 // block
mov w9, w3 // stride
movrel x7, scan8
mov x10, #16
movrel x13, X(ff_h264_idct_dc_add_neon)
movrel x14, X(ff_h264_idct_add_neon)
1: mov w2, w9
ldrb w3, [x7], #1
ldrsw x0, [x5], #4
ldrb w3, [x4, w3, uxtw]
add x0, x0, x6
cmp w3, #0
ldrsh w3, [x1]
csel x15, x13, x14, eq
ccmp w3, #0, #0, eq
b.eq 2f
blr x15
2: subs x10, x10, #1
add x1, x1, #32
b.ne 1b
ret x12
endfunc
function ff_h264_idct_add8_neon, export=1
sub sp, sp, #0x40
stp x19, x20, [sp]
mov x12, x30
ldp x6, x15, [x0] // dest[0], dest[1]
add x5, x1, #16*4 // block_offset
add x9, x2, #16*32 // block
mov w19, w3 // stride
movrel x13, X(ff_h264_idct_dc_add_neon)
movrel x14, X(ff_h264_idct_add_neon)
movrel x7, scan8, 16
mov x10, #0
mov x11, #16
1: mov w2, w19
ldrb w3, [x7, x10] // scan8[i]
ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
add x0, x0, x6 // block_offset[i] + dst[j-1]
add x1, x9, x10, lsl #5 // block + i * 16
cmp w3, #0
ldrsh w3, [x1] // block[i*16]
csel x20, x13, x14, eq
ccmp w3, #0, #0, eq
b.eq 2f
blr x20
2: add x10, x10, #1
cmp x10, #4
csel x10, x11, x10, eq // mov x10, #16
csel x6, x15, x6, eq
cmp x10, #20
b.lt 1b
ldp x19, x20, [sp]
add sp, sp, #0x40
ret x12
endfunc
.macro idct8x8_cols pass
.if \pass == 0
va .req v18
vb .req v30
sshr v18.8H, v26.8H, #1
add v16.8H, v24.8H, v28.8H
ld1 {v30.8H, v31.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
sub v17.8H, v24.8H, v28.8H
sshr v19.8H, v30.8H, #1
sub v18.8H, v18.8H, v30.8H
add v19.8H, v19.8H, v26.8H
.else
va .req v30
vb .req v18
sshr v30.8H, v26.8H, #1
sshr v19.8H, v18.8H, #1
add v16.8H, v24.8H, v28.8H
sub v17.8H, v24.8H, v28.8H
sub v30.8H, v30.8H, v18.8H
add v19.8H, v19.8H, v26.8H
.endif
add v26.8H, v17.8H, va.8H
sub v28.8H, v17.8H, va.8H
add v24.8H, v16.8H, v19.8H
sub vb.8H, v16.8H, v19.8H
sub v16.8H, v29.8H, v27.8H
add v17.8H, v31.8H, v25.8H
sub va.8H, v31.8H, v25.8H
add v19.8H, v29.8H, v27.8H
sub v16.8H, v16.8H, v31.8H
sub v17.8H, v17.8H, v27.8H
add va.8H, va.8H, v29.8H
add v19.8H, v19.8H, v25.8H
sshr v25.8H, v25.8H, #1
sshr v27.8H, v27.8H, #1
sshr v29.8H, v29.8H, #1
sshr v31.8H, v31.8H, #1
sub v16.8H, v16.8H, v31.8H
sub v17.8H, v17.8H, v27.8H
add va.8H, va.8H, v29.8H
add v19.8H, v19.8H, v25.8H
sshr v25.8H, v16.8H, #2
sshr v27.8H, v17.8H, #2
sshr v29.8H, va.8H, #2
sshr v31.8H, v19.8H, #2
sub v19.8H, v19.8H, v25.8H
sub va.8H, v27.8H, va.8H
add v17.8H, v17.8H, v29.8H
add v16.8H, v16.8H, v31.8H
.if \pass == 0
sub v31.8H, v24.8H, v19.8H
add v24.8H, v24.8H, v19.8H
add v25.8H, v26.8H, v18.8H
sub v18.8H, v26.8H, v18.8H
add v26.8H, v28.8H, v17.8H
add v27.8H, v30.8H, v16.8H
sub v29.8H, v28.8H, v17.8H
sub v28.8H, v30.8H, v16.8H
.else
sub v31.8H, v24.8H, v19.8H
add v24.8H, v24.8H, v19.8H
add v25.8H, v26.8H, v30.8H
sub v30.8H, v26.8H, v30.8H
add v26.8H, v28.8H, v17.8H
sub v29.8H, v28.8H, v17.8H
add v27.8H, v18.8H, v16.8H
sub v28.8H, v18.8H, v16.8H
.endif
.unreq va
.unreq vb
.endm
function ff_h264_idct8_add_neon, export=1
movi v19.8H, #0
sxtw x2, w2
ld1 {v24.8H, v25.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
ld1 {v26.8H, v27.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
ld1 {v28.8H, v29.8H}, [x1]
st1 {v19.8H}, [x1], #16
st1 {v19.8H}, [x1], #16
idct8x8_cols 0
transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
idct8x8_cols 1
mov x3, x0
srshr v24.8H, v24.8H, #6
ld1 {v0.8B}, [x0], x2
srshr v25.8H, v25.8H, #6
ld1 {v1.8B}, [x0], x2
srshr v26.8H, v26.8H, #6
ld1 {v2.8B}, [x0], x2
srshr v27.8H, v27.8H, #6
ld1 {v3.8B}, [x0], x2
srshr v28.8H, v28.8H, #6
ld1 {v4.8B}, [x0], x2
srshr v29.8H, v29.8H, #6
ld1 {v5.8B}, [x0], x2
srshr v30.8H, v30.8H, #6
ld1 {v6.8B}, [x0], x2
srshr v31.8H, v31.8H, #6
ld1 {v7.8B}, [x0], x2
uaddw v24.8H, v24.8H, v0.8B
uaddw v25.8H, v25.8H, v1.8B
uaddw v26.8H, v26.8H, v2.8B
sqxtun v0.8B, v24.8H
uaddw v27.8H, v27.8H, v3.8B
sqxtun v1.8B, v25.8H
uaddw v28.8H, v28.8H, v4.8B
sqxtun v2.8B, v26.8H
st1 {v0.8B}, [x3], x2
uaddw v29.8H, v29.8H, v5.8B
sqxtun v3.8B, v27.8H
st1 {v1.8B}, [x3], x2
uaddw v30.8H, v30.8H, v6.8B
sqxtun v4.8B, v28.8H
st1 {v2.8B}, [x3], x2
uaddw v31.8H, v31.8H, v7.8B
sqxtun v5.8B, v29.8H
st1 {v3.8B}, [x3], x2
sqxtun v6.8B, v30.8H
sqxtun v7.8B, v31.8H
st1 {v4.8B}, [x3], x2
st1 {v5.8B}, [x3], x2
st1 {v6.8B}, [x3], x2
st1 {v7.8B}, [x3], x2
sub x1, x1, #128
ret
endfunc
function ff_h264_idct8_dc_add_neon, export=1
mov w3, #0
sxtw x2, w2
ld1r {v31.8H}, [x1]
strh w3, [x1]
ld1 {v0.8B}, [x0], x2
srshr v31.8H, v31.8H, #6
ld1 {v1.8B}, [x0], x2
ld1 {v2.8B}, [x0], x2
uaddw v24.8H, v31.8H, v0.8B
ld1 {v3.8B}, [x0], x2
uaddw v25.8H, v31.8H, v1.8B
ld1 {v4.8B}, [x0], x2
uaddw v26.8H, v31.8H, v2.8B
ld1 {v5.8B}, [x0], x2
uaddw v27.8H, v31.8H, v3.8B
ld1 {v6.8B}, [x0], x2
uaddw v28.8H, v31.8H, v4.8B
ld1 {v7.8B}, [x0], x2
uaddw v29.8H, v31.8H, v5.8B
uaddw v30.8H, v31.8H, v6.8B
uaddw v31.8H, v31.8H, v7.8B
sqxtun v0.8B, v24.8H
sqxtun v1.8B, v25.8H
sqxtun v2.8B, v26.8H
sqxtun v3.8B, v27.8H
sub x0, x0, x2, lsl #3
st1 {v0.8B}, [x0], x2
sqxtun v4.8B, v28.8H
st1 {v1.8B}, [x0], x2
sqxtun v5.8B, v29.8H
st1 {v2.8B}, [x0], x2
sqxtun v6.8B, v30.8H
st1 {v3.8B}, [x0], x2
sqxtun v7.8B, v31.8H
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
st1 {v6.8B}, [x0], x2
st1 {v7.8B}, [x0], x2
ret
endfunc
function ff_h264_idct8_add4_neon, export=1
mov x12, x30
mov x6, x0
mov x5, x1
mov x1, x2
mov w2, w3
movrel x7, scan8
mov w10, #16
movrel x13, X(ff_h264_idct8_dc_add_neon)
movrel x14, X(ff_h264_idct8_add_neon)
1: ldrb w9, [x7], #4
ldrsw x0, [x5], #16
ldrb w9, [x4, w9, UXTW]
subs w9, w9, #1
b.lt 2f
ldrsh w11, [x1]
add x0, x6, x0
ccmp w11, #0, #4, eq
csel x15, x13, x14, ne
blr x15
2: subs w10, w10, #4
add x1, x1, #128
b.ne 1b
ret x12
endfunc
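// scan8[] maps a block index to its position in the decoder's 8-wide
// non-zero-count cache (nnzc); the idct_add16/add8/add4 entry points above
// index nnzc through it.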
const scan8
.byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
.byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
.byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
.byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
.byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
.byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
.byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
.byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
.byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
.byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
.byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
.byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst


@@ -0,0 +1,93 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
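/* RV40, SVQ3 and VP7/VP8 share this prediction context but define some
 * modes differently, hence the codec_id exclusions below. */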
static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
const int high_depth = bit_depth > 8;
if (high_depth)
return;
if (chroma_format_idc <= 1) {
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon;
if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
codec_id != AV_CODEC_ID_VP8) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon;
h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
}
}
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon;
h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
}
av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
int bit_depth, const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}


@@ -0,0 +1,361 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n >= 8 || \hi == 0
ld1 {\rd\().b}[0], [\rs], \rt
ld1 {\rd\().b}[1], [\rs], \rt
ld1 {\rd\().b}[2], [\rs], \rt
ld1 {\rd\().b}[3], [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
ld1 {\rd\().b}[4], [\rs], \rt
ld1 {\rd\().b}[5], [\rs], \rt
ld1 {\rd\().b}[6], [\rs], \rt
ld1 {\rd\().b}[7], [\rs], \rt
.endif
.if \n == 16
ld1 {\rd\().b}[8], [\rs], \rt
ld1 {\rd\().b}[9], [\rs], \rt
ld1 {\rd\().b}[10], [\rs], \rt
ld1 {\rd\().b}[11], [\rs], \rt
ld1 {\rd\().b}[12], [\rs], \rt
ld1 {\rd\().b}[13], [\rs], \rt
ld1 {\rd\().b}[14], [\rs], \rt
ld1 {\rd\().b}[15], [\rs], \rt
.endif
.endm
function ff_pred16x16_128_dc_neon, export=1
movi v0.16b, #128
b .L_pred16x16_dc_end
endfunc
function ff_pred16x16_top_dc_neon, export=1
sub x2, x0, x1
ld1 {v0.16b}, [x2]
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h, #4
dup v0.16b, v0.b[0]
b .L_pred16x16_dc_end
endfunc
function ff_pred16x16_left_dc_neon, export=1
sub x2, x0, #1
ldcol.8 v0, x2, x1, 16
uaddlv h0, v0.16b
rshrn v0.8b, v0.8h, #4
dup v0.16b, v0.b[0]
b .L_pred16x16_dc_end
endfunc
function ff_pred16x16_dc_neon, export=1
sub x2, x0, x1
sub x3, x0, #1
ld1 {v0.16b}, [x2]
ldcol.8 v1, x3, x1, 16
uaddlv h0, v0.16b
uaddlv h1, v1.16b
add v0.4h, v0.4h, v1.4h
rshrn v0.8b, v0.8h, #5
dup v0.16b, v0.b[0]
.L_pred16x16_dc_end:
mov w3, #8
6: st1 {v0.16b}, [x0], x1
st1 {v0.16b}, [x0], x1
subs w3, w3, #1
b.ne 6b
ret
endfunc
function ff_pred16x16_hor_neon, export=1
sub x2, x0, #1
mov w3, #16
1: ld1r {v0.16b}, [x2], x1
st1 {v0.16b}, [x0], x1
subs w3, w3, #1
b.ne 1b
ret
endfunc
function ff_pred16x16_vert_neon, export=1
sub x2, x0, x1
add x1, x1, x1
ld1 {v0.16b}, [x2], x1
mov w3, #8
1: st1 {v0.16b}, [x0], x1
st1 {v0.16b}, [x2], x1
subs w3, w3, #1
b.ne 1b
ret
endfunc
function ff_pred16x16_plane_neon, export=1
sub x3, x0, x1
movrel x4, p16weight
add x2, x3, #8
sub x3, x3, #1
ld1 {v0.8b}, [x3]
ld1 {v2.8b}, [x2], x1
ldcol.8 v1, x3, x1
add x3, x3, x1
ldcol.8 v3, x3, x1
rev64 v0.8b, v0.8b
rev64 v1.8b, v1.8b
uaddl v7.8h, v2.8b, v3.8b
usubl v2.8h, v2.8b, v0.8b
usubl v3.8h, v3.8b, v1.8b
ld1 {v0.8h}, [x4]
mul v2.8h, v2.8h, v0.8h
mul v3.8h, v3.8h, v0.8h
addp v2.8h, v2.8h, v3.8h
addp v2.8h, v2.8h, v2.8h
addp v2.4h, v2.4h, v2.4h
sshll v3.4s, v2.4h, #2
saddw v2.4s, v3.4s, v2.4h
rshrn v4.4h, v2.4s, #6
trn2 v5.4h, v4.4h, v4.4h
add v2.4h, v4.4h, v5.4h
shl v3.4h, v2.4h, #3
ext v7.16b, v7.16b, v7.16b, #14
sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
add v7.4h, v7.4h, v0.4h
shl v2.4h, v7.4h, #4
sub v2.4h, v2.4h, v3.4h
shl v3.4h, v4.4h, #4
ext v0.16b, v0.16b, v0.16b, #14
sub v6.4h, v5.4h, v3.4h
mov v0.h[0], wzr
mul v0.8h, v0.8h, v4.h[0]
dup v1.8h, v2.h[0]
dup v2.8h, v4.h[0]
dup v3.8h, v6.h[0]
shl v2.8h, v2.8h, #3
add v1.8h, v1.8h, v0.8h
add v3.8h, v3.8h, v2.8h
mov w3, #16
1:
sqshrun v0.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
sqshrun2 v0.16b, v1.8h, #5
add v1.8h, v1.8h, v3.8h
st1 {v0.16b}, [x0], x1
subs w3, w3, #1
b.ne 1b
ret
endfunc
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
const p8weight, align=4
.short 1,2,3,4,1,2,3,4
endconst
function ff_pred8x8_hor_neon, export=1
sub x2, x0, #1
mov w3, #8
1: ld1r {v0.8b}, [x2], x1
st1 {v0.8b}, [x0], x1
subs w3, w3, #1
b.ne 1b
ret
endfunc
function ff_pred8x8_vert_neon, export=1
sub x2, x0, x1
lsl x1, x1, #1
ld1 {v0.8b}, [x2], x1
mov w3, #4
1: st1 {v0.8b}, [x0], x1
st1 {v0.8b}, [x2], x1
subs w3, w3, #1
b.ne 1b
ret
endfunc
function ff_pred8x8_plane_neon, export=1
sub x3, x0, x1
movrel x4, p8weight
movrel x5, p16weight
add x2, x3, #4
sub x3, x3, #1
ld1 {v0.s}[0], [x3]
ld1 {v2.s}[0], [x2], x1
ldcol.8 v0, x3, x1, 4, hi=1
add x3, x3, x1
ldcol.8 v3, x3, x1, 4
uaddl v7.8h, v2.8b, v3.8b
rev32 v0.8b, v0.8b
trn1 v2.2s, v2.2s, v3.2s
usubl v2.8h, v2.8b, v0.8b
ld1 {v6.8h}, [x4]
mul v2.8h, v2.8h, v6.8h
ld1 {v0.8h}, [x5]
saddlp v2.4s, v2.8h
addp v2.4s, v2.4s, v2.4s
shl v3.4s, v2.4s, #4
add v2.4s, v3.4s, v2.4s
rshrn v5.4h, v2.4s, #5
addp v2.4h, v5.4h, v5.4h
shl v3.4h, v2.4h, #1
add v3.4h, v3.4h, v2.4h
rev64 v7.4h, v7.4h
add v7.4h, v7.4h, v0.4h
shl v2.4h, v7.4h, #4
sub v2.4h, v2.4h, v3.4h
ext v0.16b, v0.16b, v0.16b, #14
mov v0.h[0], wzr
mul v0.8h, v0.8h, v5.h[0]
dup v1.8h, v2.h[0]
dup v2.8h, v5.h[1]
add v1.8h, v1.8h, v0.8h
mov w3, #8
1:
sqshrun v0.8b, v1.8h, #5
add v1.8h, v1.8h, v2.8h
st1 {v0.8b}, [x0], x1
subs w3, w3, #1
b.ne 1b
ret
endfunc
function ff_pred8x8_128_dc_neon, export=1
movi v0.8b, #128
movi v1.8b, #128
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_top_dc_neon, export=1
sub x2, x0, x1
ld1 {v0.8b}, [x2]
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
zip1 v0.8h, v0.8h, v0.8h
rshrn v2.8b, v0.8h, #2
zip1 v0.8b, v2.8b, v2.8b
zip1 v1.8b, v2.8b, v2.8b
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_left_dc_neon, export=1
sub x2, x0, #1
ldcol.8 v0, x2, x1
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v2.8b, v0.8h, #2
dup v1.8b, v2.b[1]
dup v0.8b, v2.b[0]
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_dc_neon, export=1
sub x2, x0, x1
sub x3, x0, #1
ld1 {v0.8b}, [x2]
ldcol.8 v1, x3, x1
uaddlp v0.4h, v0.8b
uaddlp v1.4h, v1.8b
trn1 v2.2s, v0.2s, v1.2s
trn2 v3.2s, v0.2s, v1.2s
addp v4.4h, v2.4h, v3.4h
addp v5.4h, v4.4h, v4.4h
rshrn v6.8b, v5.8h, #3
rshrn v7.8b, v4.8h, #2
dup v0.8b, v6.b[0]
dup v2.8b, v7.b[2]
dup v1.8b, v7.b[3]
dup v3.8b, v6.b[1]
zip1 v0.2s, v0.2s, v2.2s
zip1 v1.2s, v1.2s, v3.2s
.L_pred8x8_dc_end:
mov w3, #4
add x2, x0, x1, lsl #2
6: st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x2], x1
subs w3, w3, #1
b.ne 6b
ret
endfunc
function ff_pred8x8_l0t_dc_neon, export=1
sub x2, x0, x1
sub x3, x0, #1
ld1 {v0.8b}, [x2]
ldcol.8 v1, x3, x1, 4
zip1 v0.4s, v0.4s, v1.4s
uaddlp v0.8h, v0.16b
addp v0.8h, v0.8h, v0.8h
addp v1.4h, v0.4h, v0.4h
rshrn v2.8b, v0.8h, #2
rshrn v3.8b, v1.8h, #3
dup v4.8b, v3.b[0]
dup v6.8b, v2.b[2]
dup v5.8b, v2.b[0]
zip1 v0.2s, v4.2s, v6.2s
zip1 v1.2s, v5.2s, v6.2s
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_l00_dc_neon, export=1
sub x2, x0, #1
ldcol.8 v0, x2, x1, 4
uaddlp v0.4h, v0.8b
addp v0.4h, v0.4h, v0.4h
rshrn v0.8b, v0.8h, #2
movi v1.8b, #128
dup v0.8b, v0.b[0]
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_0lt_dc_neon, export=1
add x3, x0, x1, lsl #2
sub x2, x0, x1
sub x3, x3, #1
ld1 {v0.8b}, [x2]
ldcol.8 v1, x3, x1, 4, hi=1
zip1 v0.4s, v0.4s, v1.4s
uaddlp v0.8h, v0.16b
addp v0.8h, v0.8h, v0.8h
addp v1.4h, v0.4h, v0.4h
rshrn v2.8b, v0.8h, #2
rshrn v3.8b, v1.8h, #3
dup v4.8b, v2.b[0]
dup v5.8b, v2.b[3]
dup v6.8b, v2.b[2]
dup v7.8b, v3.b[1]
zip1 v0.2s, v4.2s, v6.2s
zip1 v1.2s, v5.2s, v7.2s
b .L_pred8x8_dc_end
endfunc
function ff_pred8x8_0l0_dc_neon, export=1
add x2, x0, x1, lsl #2
sub x2, x2, #1
ldcol.8 v1, x2, x1, 4
uaddlp v2.4h, v1.8b
addp v2.4h, v2.4h, v2.4h
rshrn v1.8b, v2.8h, #2
movi v0.8b, #128
dup v1.8b, v1.b[0]
b .L_pred8x8_dc_end
endfunc

@@ -0,0 +1,123 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stddef.h>
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/hpeldsp.h"
void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
}
}

@@ -0,0 +1,397 @@
/*
* ARM NEON optimised DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
.macro pixels16 rnd=1, avg=0
.if \avg
mov x12, x0
.endif
1: ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
ld1 {v2.16B}, [x1], x2
ld1 {v3.16B}, [x1], x2
.if \avg
ld1 {v4.16B}, [x12], x2
urhadd v0.16B, v0.16B, v4.16B
ld1 {v5.16B}, [x12], x2
urhadd v1.16B, v1.16B, v5.16B
ld1 {v6.16B}, [x12], x2
urhadd v2.16B, v2.16B, v6.16B
ld1 {v7.16B}, [x12], x2
urhadd v3.16B, v3.16B, v7.16B
.endif
subs w3, w3, #4
st1 {v0.16B}, [x0], x2
st1 {v1.16B}, [x0], x2
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels16_x2 rnd=1, avg=0
1: ld1 {v0.16B, v1.16B}, [x1], x2
ld1 {v2.16B, v3.16B}, [x1], x2
subs w3, w3, #2
ext v1.16B, v0.16B, v1.16B, #1
avg v0.16B, v0.16B, v1.16B
ext v3.16B, v2.16B, v3.16B, #1
avg v2.16B, v2.16B, v3.16B
.if \avg
ld1 {v1.16B}, [x0], x2
ld1 {v3.16B}, [x0]
urhadd v0.16B, v0.16B, v1.16B
urhadd v2.16B, v2.16B, v3.16B
sub x0, x0, x2
.endif
st1 {v0.16B}, [x0], x2
st1 {v2.16B}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels16_y2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
1: subs w3, w3, #2
avg v2.16B, v0.16B, v1.16B
ld1 {v0.16B}, [x1], x2
avg v3.16B, v0.16B, v1.16B
ld1 {v1.16B}, [x1], x2
.if \avg
ld1 {v4.16B}, [x0], x2
ld1 {v5.16B}, [x0]
urhadd v2.16B, v2.16B, v4.16B
urhadd v3.16B, v3.16B, v5.16B
sub x0, x0, x2
.endif
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
b.ne 1b
avg v2.16B, v0.16B, v1.16B
ld1 {v0.16B}, [x1], x2
avg v3.16B, v0.16B, v1.16B
.if \avg
ld1 {v4.16B}, [x0], x2
ld1 {v5.16B}, [x0]
urhadd v2.16B, v2.16B, v4.16B
urhadd v3.16B, v3.16B, v5.16B
sub x0, x0, x2
.endif
st1 {v2.16B}, [x0], x2
st1 {v3.16B}, [x0], x2
ret
.endm
.macro pixels16_xy2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2
ld1 {v4.16B, v5.16B}, [x1], x2
NRND movi v26.8H, #1
ext v1.16B, v0.16B, v1.16B, #1
ext v5.16B, v4.16B, v5.16B, #1
uaddl v16.8H, v0.8B, v1.8B
uaddl2 v20.8H, v0.16B, v1.16B
uaddl v18.8H, v4.8B, v5.8B
uaddl2 v22.8H, v4.16B, v5.16B
1: subs w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2
add v24.8H, v16.8H, v18.8H
NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1
add v1.8H, v20.8H, v22.8H
mshrn v28.8B, v24.8H, #2
NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2
.if \avg
ld1 {v16.16B}, [x0]
urhadd v28.16B, v28.16B, v16.16B
.endif
uaddl v16.8H, v0.8B, v30.8B
ld1 {v2.16B, v3.16B}, [x1], x2
uaddl2 v20.8H, v0.16B, v30.16B
st1 {v28.16B}, [x0], x2
add v24.8H, v16.8H, v18.8H
NRND add v24.8H, v24.8H, v26.8H
ext v3.16B, v2.16B, v3.16B, #1
add v0.8H, v20.8H, v22.8H
mshrn v30.8B, v24.8H, #2
NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2
.if \avg
ld1 {v18.16B}, [x0]
urhadd v30.16B, v30.16B, v18.16B
.endif
uaddl v18.8H, v2.8B, v3.8B
uaddl2 v22.8H, v2.16B, v3.16B
st1 {v30.16B}, [x0], x2
b.gt 1b
ld1 {v0.16B, v1.16B}, [x1], x2
add v24.8H, v16.8H, v18.8H
NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1
add v1.8H, v20.8H, v22.8H
mshrn v28.8B, v24.8H, #2
NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2
.if \avg
ld1 {v16.16B}, [x0]
urhadd v28.16B, v28.16B, v16.16B
.endif
uaddl v16.8H, v0.8B, v30.8B
uaddl2 v20.8H, v0.16B, v30.16B
st1 {v28.16B}, [x0], x2
add v24.8H, v16.8H, v18.8H
NRND add v24.8H, v24.8H, v26.8H
add v0.8H, v20.8H, v22.8H
mshrn v30.8B, v24.8H, #2
NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2
.if \avg
ld1 {v18.16B}, [x0]
urhadd v30.16B, v30.16B, v18.16B
.endif
st1 {v30.16B}, [x0], x2
ret
.endm
.macro pixels8 rnd=1, avg=0
1: ld1 {v0.8B}, [x1], x2
ld1 {v1.8B}, [x1], x2
ld1 {v2.8B}, [x1], x2
ld1 {v3.8B}, [x1], x2
.if \avg
ld1 {v4.8B}, [x0], x2
urhadd v0.8B, v0.8B, v4.8B
ld1 {v5.8B}, [x0], x2
urhadd v1.8B, v1.8B, v5.8B
ld1 {v6.8B}, [x0], x2
urhadd v2.8B, v2.8B, v6.8B
ld1 {v7.8B}, [x0], x2
urhadd v3.8B, v3.8B, v7.8B
sub x0, x0, x2, lsl #2
.endif
subs w3, w3, #4
st1 {v0.8B}, [x0], x2
st1 {v1.8B}, [x0], x2
st1 {v2.8B}, [x0], x2
st1 {v3.8B}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels8_x2 rnd=1, avg=0
1: ld1 {v0.8B, v1.8B}, [x1], x2
ext v1.8B, v0.8B, v1.8B, #1
ld1 {v2.8B, v3.8B}, [x1], x2
ext v3.8B, v2.8B, v3.8B, #1
subs w3, w3, #2
avg v0.8B, v0.8B, v1.8B
avg v2.8B, v2.8B, v3.8B
.if \avg
ld1 {v4.8B}, [x0], x2
ld1 {v5.8B}, [x0]
urhadd v0.8B, v0.8B, v4.8B
urhadd v2.8B, v2.8B, v5.8B
sub x0, x0, x2
.endif
st1 {v0.8B}, [x0], x2
st1 {v2.8B}, [x0], x2
b.ne 1b
ret
.endm
.macro pixels8_y2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.8B}, [x1], x2
ld1 {v1.8B}, [x1], x2
1: subs w3, w3, #2
avg v4.8B, v0.8B, v1.8B
ld1 {v0.8B}, [x1], x2
avg v5.8B, v0.8B, v1.8B
ld1 {v1.8B}, [x1], x2
.if \avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v4.8B, v4.8B, v2.8B
urhadd v5.8B, v5.8B, v3.8B
sub x0, x0, x2
.endif
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
b.ne 1b
avg v4.8B, v0.8B, v1.8B
ld1 {v0.8B}, [x1], x2
avg v5.8B, v0.8B, v1.8B
.if \avg
ld1 {v2.8B}, [x0], x2
ld1 {v3.8B}, [x0]
urhadd v4.8B, v4.8B, v2.8B
urhadd v5.8B, v5.8B, v3.8B
sub x0, x0, x2
.endif
st1 {v4.8B}, [x0], x2
st1 {v5.8B}, [x0], x2
ret
.endm
.macro pixels8_xy2 rnd=1, avg=0
sub w3, w3, #2
ld1 {v0.16B}, [x1], x2
ld1 {v1.16B}, [x1], x2
NRND movi v19.8H, #1
ext v4.16B, v0.16B, v4.16B, #1
ext v6.16B, v1.16B, v6.16B, #1
uaddl v16.8H, v0.8B, v4.8B
uaddl v17.8H, v1.8B, v6.8B
1: subs w3, w3, #2
ld1 {v0.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
ext v4.16B, v0.16B, v4.16B, #1
NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B
mshrn v5.8B, v18.8H, #2
ld1 {v1.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
.if \avg
ld1 {v7.8B}, [x0]
urhadd v5.8B, v5.8B, v7.8B
.endif
NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2
mshrn v7.8B, v18.8H, #2
.if \avg
ld1 {v5.8B}, [x0]
urhadd v7.8B, v7.8B, v5.8B
.endif
ext v6.16B, v1.16B, v6.16B, #1
uaddl v17.8H, v1.8B, v6.8B
st1 {v7.8B}, [x0], x2
b.gt 1b
ld1 {v0.16B}, [x1], x2
add v18.8H, v16.8H, v17.8H
ext v4.16B, v0.16B, v4.16B, #1
NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B
mshrn v5.8B, v18.8H, #2
add v18.8H, v16.8H, v17.8H
.if \avg
ld1 {v7.8B}, [x0]
urhadd v5.8B, v5.8B, v7.8B
.endif
NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2
mshrn v7.8B, v18.8H, #2
.if \avg
ld1 {v5.8B}, [x0]
urhadd v7.8B, v7.8B, v5.8B
.endif
st1 {v7.8B}, [x0], x2
ret
.endm
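// pixfunc binds the helper macros per variant: with rnd=1, avg/mshrn round
// (urhadd/rshrn) and NRND-prefixed lines assemble to nothing; with rnd=0
// they truncate (uhadd/shrn) and the NRND bias additions are emitted.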
.macro pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
.macro avg rd, rn, rm
urhadd \rd, \rn, \rm
.endm
.macro mshrn rd, rn, rm
rshrn \rd, \rn, \rm
.endm
.macro mshrn2 rd, rn, rm
rshrn2 \rd, \rn, \rm
.endm
.macro NRND insn:vararg
.endm
.else
.macro avg rd, rn, rm
uhadd \rd, \rn, \rm
.endm
.macro mshrn rd, rn, rm
shrn \rd, \rn, \rm
.endm
.macro mshrn2 rd, rn, rm
shrn2 \rd, \rn, \rm
.endm
.macro NRND insn:vararg
\insn
.endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
\name \rnd, \avg
endfunc
.purgem avg
.purgem mshrn
.purgem mshrn2
.purgem NRND
.endm
.macro pixfunc2 pfx, name, avg=0
pixfunc \pfx, \name, rnd=1, avg=\avg
pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
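// The qpel*_mc00 wrappers below only set the height in w3; they have no
// ret, so control falls through into the pixels* function that pixfunc
// emits immediately afterwards.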
function ff_put_h264_qpel16_mc00_neon, export=1
mov w3, #16
endfunc
pixfunc put_, pixels16, avg=0
pixfunc2 put_, pixels16_x2, avg=0
pixfunc2 put_, pixels16_y2, avg=0
pixfunc2 put_, pixels16_xy2, avg=0
function ff_avg_h264_qpel16_mc00_neon, export=1
mov w3, #16
endfunc
pixfunc avg_, pixels16, avg=1
pixfunc2 avg_, pixels16_x2, avg=1
pixfunc2 avg_, pixels16_y2, avg=1
pixfunc2 avg_, pixels16_xy2, avg=1
function ff_put_h264_qpel8_mc00_neon, export=1
mov w3, #8
endfunc
pixfunc put_, pixels8, avg=0
pixfunc2 put_, pixels8_x2, avg=0
pixfunc2 put_, pixels8_y2, avg=0
pixfunc2 put_, pixels8_xy2, avg=0
function ff_avg_h264_qpel8_mc00_neon, export=1
mov w3, #8
endfunc
pixfunc avg_, pixels8, avg=1
pixfunc avg_, pixels8_x2, avg=1
pixfunc avg_, pixels8_y2, avg=1
pixfunc avg_, pixels8_xy2, avg=1

@@ -0,0 +1,28 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_AARCH64_IDCT_H
#define AVCODEC_AARCH64_IDCT_H
#include <stdint.h>
void ff_simple_idct_neon(int16_t *data);
void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
#endif /* AVCODEC_AARCH64_IDCT_H */

@@ -0,0 +1,41 @@
/*
* ARM-NEON-optimized IDCT functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"
av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth)
{
if (!avctx->lowres && !high_bit_depth) {
if (avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLENEON) {
c->idct_put = ff_simple_idct_put_neon;
c->idct_add = ff_simple_idct_add_neon;
c->idct = ff_simple_idct_neon;
c->perm_type = FF_IDCT_PERM_PARTTRANS;
}
}
}

@@ -0,0 +1,323 @@
/*
* AArch64 NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
function ff_imdct_half_neon, export=1
sub sp, sp, #32
stp x19, x20, [sp]
str x30, [sp, #16]
mov x12, #1
ldr w14, [x0, #28] // mdct_bits
ldr x4, [x0, #32] // tcos
ldr x3, [x0, #8] // revtab
lsl x12, x12, x14 // n = 1 << nbits
lsr x14, x12, #2 // n4 = n >> 2
add x7, x2, x12, lsl #1
mov x12, #-16
sub x7, x7, #16
ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x
rev64 v17.2s, v17.2s
ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
fmul v6.2s, v17.2s, v2.2s
fmul v7.2s, v0.2s, v2.2s
1:
subs x14, x14, #2
ldr w6, [x3], #4
fmul v4.2s, v0.2s, v3.2s
fmul v5.2s, v17.2s, v3.2s
fsub v4.2s, v6.2s, v4.2s
fadd v5.2s, v5.2s, v7.2s
ubfm x8, x6, #16, #31
ubfm x6, x6, #0, #15
add x8, x1, x8, lsl #3
add x6, x1, x6, lsl #3
b.eq 2f
ld2 {v16.2s,v17.2s}, [x7], x12
ld2 {v0.2s,v1.2s}, [x2], #16
rev64 v17.2s, v17.2s
ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
fmul v6.2s, v17.2s, v2.2s
fmul v7.2s, v0.2s, v2.2s
st2 {v4.s,v5.s}[0], [x6]
st2 {v4.s,v5.s}[1], [x8]
b 1b
2:
st2 {v4.s,v5.s}[0], [x6]
st2 {v4.s,v5.s}[1], [x8]
mov x19, x0
mov x20, x1
bl X(ff_fft_calc_neon)
mov x12, #1
ldr w14, [x19, #28] // mdct_bits
ldr x4, [x19, #32] // tcos
lsl x12, x12, x14 // n = 1 << nbits
lsr x14, x12, #3 // n8 = n >> 3
add x4, x4, x14, lsl #3
add x6, x20, x14, lsl #3
sub x1, x4, #16
sub x3, x6, #16
mov x7, #-16
mov x8, x6
mov x0, x3
ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0
ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
3:
subs x14, x14, #2
fmul v7.2s, v0.2s, v17.2s
ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3
fmul v4.2s, v1.2s, v17.2s
fmul v6.2s, v21.2s, v19.2s
fmul v5.2s, v20.2s, v19.2s
fmul v22.2s, v1.2s, v16.2s
fmul v23.2s, v21.2s, v18.2s
fmul v24.2s, v0.2s, v16.2s
fmul v25.2s, v20.2s, v18.2s
fadd v7.2s, v7.2s, v22.2s
fadd v5.2s, v5.2s, v23.2s
fsub v4.2s, v4.2s, v24.2s
fsub v6.2s, v6.2s, v25.2s
b.eq 4f
ld2 {v0.2s,v1.2s}, [x3], x7
ld2 {v20.2s,v21.2s},[x6], #16
ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0], x7
st2 {v6.2s,v7.2s}, [x8], #16
b 3b
4:
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0]
st2 {v6.2s,v7.2s}, [x8]
ldp x19, x20, [sp]
ldr x30, [sp, #16]
add sp, sp, #32
ret
endfunc
function ff_imdct_calc_neon, export=1
sub sp, sp, #32
stp x19, x20, [sp]
str x30, [sp, #16]
ldr w3, [x0, #28] // mdct_bits
mov x19, #1
mov x20, x1
lsl x19, x19, x3
add x1, x1, x19
bl X(ff_imdct_half_neon)
add x0, x20, x19, lsl #2
add x1, x20, x19, lsl #1
sub x0, x0, #8
sub x2, x1, #16
mov x3, #-16
mov x6, #-8
1:
ld1 {v0.4s}, [x2], x3
prfum pldl1keep, [x0, #-16]
rev64 v0.4s, v0.4s
ld1 {v2.2s,v3.2s}, [x1], #16
fneg v4.4s, v0.4s
prfum pldl1keep, [x2, #-16]
rev64 v2.2s, v2.2s
rev64 v3.2s, v3.2s
ext v4.16b, v4.16b, v4.16b, #8
st1 {v2.2s}, [x0], x6
st1 {v3.2s}, [x0], x6
st1 {v4.4s}, [x20], #16
subs x19, x19, #16
b.gt 1b
ldp x19, x20, [sp], #16
ldr x30, [sp], #16
ret
endfunc
function ff_mdct_calc_neon, export=1
sub sp, sp, #32
stp x19, x20, [sp]
str x30, [sp, #16]
mov x12, #1
ldr w14, [x0, #28] // mdct_bits
ldr x4, [x0, #32] // tcos
ldr x3, [x0, #8] // revtab
lsl x14, x12, x14 // n = 1 << nbits
add x7, x2, x14 // in4u
sub x9, x7, #16 // in4d
add x2, x7, x14, lsl #1 // in3u
add x8, x9, x14, lsl #1 // in3d
add x5, x4, x14, lsl #1
sub x5, x5, #16
sub x3, x3, #4
mov x12, #-16
lsr x13, x14, #1
ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
1:
fmul v7.2s, v0.2s, v21.2s // I*s
ldr w10, [x3, x13]
fmul v6.2s, v2.2s, v20.2s // -R*c
ldr w6, [x3, #4]!
fmul v4.2s, v2.2s, v21.2s // -R*s
fmul v5.2s, v0.2s, v20.2s // I*c
fmul v24.2s, v16.2s, v30.2s // R*c
fmul v25.2s, v18.2s, v31.2s // -I*s
fmul v22.2s, v16.2s, v31.2s // R*s
fmul v23.2s, v18.2s, v30.2s // I*c
subs x14, x14, #16
subs x13, x13, #8
fsub v6.2s, v6.2s, v7.2s // -R*c-I*s
fadd v7.2s, v4.2s, v5.2s // -R*s+I*c
fsub v24.2s, v25.2s, v24.2s // I*s-R*c
fadd v25.2s, v22.2s, v23.2s // R*s-I*c
b.eq 1f
mov x12, #-16
ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
fneg v7.2s, v7.2s // R*s-I*c
ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
ubfm x12, x6, #16, #31
ubfm x6, x6, #0, #15
add x12, x1, x12, lsl #3
add x6, x1, x6, lsl #3
st2 {v6.s,v7.s}[0], [x6]
st2 {v6.s,v7.s}[1], [x12]
ubfm x6, x10, #16, #31
ubfm x10, x10, #0, #15
add x6 , x1, x6, lsl #3
add x10, x1, x10, lsl #3
st2 {v24.s,v25.s}[0], [x10]
st2 {v24.s,v25.s}[1], [x6]
b 1b
1:
fneg v7.2s, v7.2s // R*s-I*c
ubfm x12, x6, #16, #31
ubfm x6, x6, #0, #15
add x12, x1, x12, lsl #3
add x6, x1, x6, lsl #3
st2 {v6.s,v7.s}[0], [x6]
st2 {v6.s,v7.s}[1], [x12]
ubfm x6, x10, #16, #31
ubfm x10, x10, #0, #15
add x6 , x1, x6, lsl #3
add x10, x1, x10, lsl #3
st2 {v24.s,v25.s}[0], [x10]
st2 {v24.s,v25.s}[1], [x6]
mov x19, x0
mov x20, x1
bl X(ff_fft_calc_neon)
mov x12, #1
ldr w14, [x19, #28] // mdct_bits
ldr x4, [x19, #32] // tcos
lsl x12, x12, x14 // n = 1 << nbits
lsr x14, x12, #3 // n8 = n >> 3
add x4, x4, x14, lsl #3
add x6, x20, x14, lsl #3
sub x1, x4, #16
sub x3, x6, #16
mov x7, #-16
mov x8, x6
mov x0, x3
ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0
ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
1:
subs x14, x14, #2
fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0
ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0
fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3
fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3
fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0
fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3
fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3
fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0
fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0
fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3
fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3
fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0
fneg v4.2s, v4.2s
fneg v6.2s, v6.2s
b.eq 1f
ld2 {v0.2s, v1.2s}, [x3], x7
ld2 {v20.2s,v21.2s}, [x6], #16
ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0], x7
st2 {v6.2s,v7.2s}, [x8], #16
b 1b
1:
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0]
st2 {v6.2s,v7.2s}, [x8]
ldp x19, x20, [sp], #16
ldr x30, [sp], #16
ret
endfunc

@@ -0,0 +1,149 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
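// In-register transposes: each macro transposes a matrix held one row per
// vector register, built entirely from trn1/trn2 element interleaves; the
// trailing register arguments serve as scratch.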
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8B, \r0\().8B, \r1\().8B
trn2 \r9\().8B, \r0\().8B, \r1\().8B
trn1 \r1\().8B, \r2\().8B, \r3\().8B
trn2 \r3\().8B, \r2\().8B, \r3\().8B
trn1 \r0\().8B, \r4\().8B, \r5\().8B
trn2 \r5\().8B, \r4\().8B, \r5\().8B
trn1 \r2\().8B, \r6\().8B, \r7\().8B
trn2 \r7\().8B, \r6\().8B, \r7\().8B
trn1 \r4\().4H, \r0\().4H, \r2\().4H
trn2 \r2\().4H, \r0\().4H, \r2\().4H
trn1 \r6\().4H, \r5\().4H, \r7\().4H
trn2 \r7\().4H, \r5\().4H, \r7\().4H
trn1 \r5\().4H, \r9\().4H, \r3\().4H
trn2 \r9\().4H, \r9\().4H, \r3\().4H
trn1 \r3\().4H, \r8\().4H, \r1\().4H
trn2 \r8\().4H, \r8\().4H, \r1\().4H
trn1 \r0\().2S, \r3\().2S, \r4\().2S
trn2 \r4\().2S, \r3\().2S, \r4\().2S
trn1 \r1\().2S, \r5\().2S, \r6\().2S
trn2 \r5\().2S, \r5\().2S, \r6\().2S
trn2 \r6\().2S, \r8\().2S, \r2\().2S
trn1 \r2\().2S, \r8\().2S, \r2\().2S
trn1 \r3\().2S, \r9\().2S, \r7\().2S
trn2 \r7\().2S, \r9\().2S, \r7\().2S
.endm
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16B, \r0\().16B, \r1\().16B
trn2 \t1\().16B, \r0\().16B, \r1\().16B
trn1 \r1\().16B, \r2\().16B, \r3\().16B
trn2 \r3\().16B, \r2\().16B, \r3\().16B
trn1 \r0\().16B, \r4\().16B, \r5\().16B
trn2 \r5\().16B, \r4\().16B, \r5\().16B
trn1 \r2\().16B, \r6\().16B, \r7\().16B
trn2 \r7\().16B, \r6\().16B, \r7\().16B
trn1 \r4\().8H, \r0\().8H, \r2\().8H
trn2 \r2\().8H, \r0\().8H, \r2\().8H
trn1 \r6\().8H, \r5\().8H, \r7\().8H
trn2 \r7\().8H, \r5\().8H, \r7\().8H
trn1 \r5\().8H, \t1\().8H, \r3\().8H
trn2 \t1\().8H, \t1\().8H, \r3\().8H
trn1 \r3\().8H, \t0\().8H, \r1\().8H
trn2 \t0\().8H, \t0\().8H, \r1\().8H
trn1 \r0\().4S, \r3\().4S, \r4\().4S
trn2 \r4\().4S, \r3\().4S, \r4\().4S
trn1 \r1\().4S, \r5\().4S, \r6\().4S
trn2 \r5\().4S, \r5\().4S, \r6\().4S
trn2 \r6\().4S, \t0\().4S, \r2\().4S
trn1 \r2\().4S, \t0\().4S, \r2\().4S
trn1 \r3\().4S, \t1\().4S, \r7\().4S
trn2 \r7\().4S, \t1\().4S, \r7\().4S
.endm
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16B, \r0\().16B, \r1\().16B
trn2 \t5\().16B, \r0\().16B, \r1\().16B
trn1 \t6\().16B, \r2\().16B, \r3\().16B
trn2 \t7\().16B, \r2\().16B, \r3\().16B
trn1 \r0\().8H, \t4\().8H, \t6\().8H
trn2 \r2\().8H, \t4\().8H, \t6\().8H
trn1 \r1\().8H, \t5\().8H, \t7\().8H
trn2 \r3\().8H, \t5\().8H, \t7\().8H
.endm
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8B, \r0\().8B, \r1\().8B
trn2 \t5\().8B, \r0\().8B, \r1\().8B
trn1 \t6\().8B, \r2\().8B, \r3\().8B
trn2 \t7\().8B, \r2\().8B, \r3\().8B
trn1 \r0\().4H, \t4\().4H, \t6\().4H
trn2 \r2\().4H, \t4\().4H, \t6\().4H
trn1 \r1\().4H, \t5\().4H, \t7\().4H
trn2 \r3\().4H, \t5\().4H, \t7\().4H
.endm
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
trn1 \r6\().4H, \r2\().4H, \r3\().4H
trn2 \r7\().4H, \r2\().4H, \r3\().4H
trn1 \r0\().2S, \r4\().2S, \r6\().2S
trn2 \r2\().2S, \r4\().2S, \r6\().2S
trn1 \r1\().2S, \r5\().2S, \r7\().2S
trn2 \r3\().2S, \r5\().2S, \r7\().2S
.endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8H, \r0\().8H, \r1\().8H
trn2 \r9\().8H, \r0\().8H, \r1\().8H
trn1 \r1\().8H, \r2\().8H, \r3\().8H
trn2 \r3\().8H, \r2\().8H, \r3\().8H
trn1 \r0\().8H, \r4\().8H, \r5\().8H
trn2 \r5\().8H, \r4\().8H, \r5\().8H
trn1 \r2\().8H, \r6\().8H, \r7\().8H
trn2 \r7\().8H, \r6\().8H, \r7\().8H
trn1 \r4\().4S, \r0\().4S, \r2\().4S
trn2 \r2\().4S, \r0\().4S, \r2\().4S
trn1 \r6\().4S, \r5\().4S, \r7\().4S
trn2 \r7\().4S, \r5\().4S, \r7\().4S
trn1 \r5\().4S, \r9\().4S, \r3\().4S
trn2 \r9\().4S, \r9\().4S, \r3\().4S
trn1 \r3\().4S, \r8\().4S, \r1\().4S
trn2 \r8\().4S, \r8\().4S, \r1\().4S
trn1 \r0\().2D, \r3\().2D, \r4\().2D
trn2 \r4\().2D, \r3\().2D, \r4\().2D
trn1 \r1\().2D, \r5\().2D, \r6\().2D
trn2 \r5\().2D, \r5\().2D, \r6\().2D
trn2 \r6\().2D, \r8\().2D, \r2\().2D
trn1 \r2\().2D, \r8\().2D, \r2\().2D
trn1 \r3\().2D, \r9\().2D, \r7\().2D
trn2 \r7\().2D, \r9\().2D, \r7\().2D
.endm

@@ -0,0 +1,362 @@
/*
* ARM NEON IDCT
*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
*
* Based on Simple IDCT
* Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define Z1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
#define ROW_SHIFT 11
#define COL_SHIFT 20
#define z1 v0.H[0]
#define z2 v0.H[1]
#define z3 v0.H[2]
#define z4 v0.H[3]
#define z5 v0.H[4]
#define z6 v0.H[5]
#define z7 v0.H[6]
#define z4c v0.H[7]
const idct_coeff_neon, align=4
.short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
endconst
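// The 8x8 IDCT runs as a row pass (results scaled down by ROW_SHIFT) and
// a column pass (COL_SHIFT); both passes test for all-zero upper
// coefficients and skip the corresponding multiply-accumulate chains.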
.macro idct_start data
prfm pldl1keep, [\data]
mov x10, x30
movrel x3, idct_coeff_neon
ld1 {v0.2D}, [x3]
.endm
.macro idct_end
br x10
.endm
.macro smull1 a, b, c
smull \a, \b, \c
.endm
.macro smlal1 a, b, c
smlal \a, \b, \c
.endm
.macro smlsl1 a, b, c
smlsl \a, \b, \c
.endm
.macro idct_col4_top y1, y2, y3, y4, i, l
smull\i v7.4S, \y3\l, z2
smull\i v16.4S, \y3\l, z6
smull\i v17.4S, \y2\l, z1
add v19.4S, v23.4S, v7.4S
smull\i v18.4S, \y2\l, z3
add v20.4S, v23.4S, v16.4S
smull\i v5.4S, \y2\l, z5
sub v21.4S, v23.4S, v16.4S
smull\i v6.4S, \y2\l, z7
sub v22.4S, v23.4S, v7.4S
smlal\i v17.4S, \y4\l, z3
smlsl\i v18.4S, \y4\l, z7
smlsl\i v5.4S, \y4\l, z1
smlsl\i v6.4S, \y4\l, z5
.endm
.macro idct_row4_neon y1, y2, y3, y4, pass
ld1 {\y1\().2D,\y2\().2D}, [x2], #32
movi v23.4S, #1<<2, lsl #8
orr v5.16B, \y1\().16B, \y2\().16B
ld1 {\y3\().2D,\y4\().2D}, [x2], #32
orr v6.16B, \y3\().16B, \y4\().16B
orr v5.16B, v5.16B, v6.16B
mov x3, v5.D[1]
smlal v23.4S, \y1\().4H, z4
idct_col4_top \y1, \y2, \y3, \y4, 1, .4H
cmp x3, #0
b.eq \pass\()f
smull2 v7.4S, \y1\().8H, z4
smlal2 v17.4S, \y2\().8H, z5
smlsl2 v18.4S, \y2\().8H, z1
smull2 v16.4S, \y3\().8H, z2
smlal2 v5.4S, \y2\().8H, z7
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v7.4S
sub v21.4S, v21.4S, v7.4S
add v22.4S, v22.4S, v7.4S
smlal2 v6.4S, \y2\().8H, z3
smull2 v7.4S, \y3\().8H, z6
smlal2 v17.4S, \y4\().8H, z7
smlsl2 v18.4S, \y4\().8H, z5
smlal2 v5.4S, \y4\().8H, z3
smlsl2 v6.4S, \y4\().8H, z1
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v16.4S
add v21.4S, v21.4S, v16.4S
sub v22.4S, v22.4S, v7.4S
\pass: add \y3\().4S, v19.4S, v17.4S
add \y4\().4S, v20.4S, v18.4S
shrn \y1\().4H, \y3\().4S, #ROW_SHIFT
shrn \y2\().4H, \y4\().4S, #ROW_SHIFT
add v7.4S, v21.4S, v5.4S
add v16.4S, v22.4S, v6.4S
shrn \y3\().4H, v7.4S, #ROW_SHIFT
shrn \y4\().4H, v16.4S, #ROW_SHIFT
sub v22.4S, v22.4S, v6.4S
sub v19.4S, v19.4S, v17.4S
sub v21.4S, v21.4S, v5.4S
shrn2 \y1\().8H, v22.4S, #ROW_SHIFT
sub v20.4S, v20.4S, v18.4S
shrn2 \y2\().8H, v21.4S, #ROW_SHIFT
shrn2 \y3\().8H, v20.4S, #ROW_SHIFT
shrn2 \y4\().8H, v19.4S, #ROW_SHIFT
trn1 v16.8H, \y1\().8H, \y2\().8H
trn2 v17.8H, \y1\().8H, \y2\().8H
trn1 v18.8H, \y3\().8H, \y4\().8H
trn2 v19.8H, \y3\().8H, \y4\().8H
trn1 \y1\().4S, v16.4S, v18.4S
trn1 \y2\().4S, v17.4S, v19.4S
trn2 \y3\().4S, v16.4S, v18.4S
trn2 \y4\().4S, v17.4S, v19.4S
.endm
.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
dup v23.4H, z4c
.if \i == 1
add v23.4H, v23.4H, v24.4H
.else
mov v5.D[0], v24.D[1]
add v23.4H, v23.4H, v5.4H
.endif
smull v23.4S, v23.4H, z4
idct_col4_top v24, v25, v26, v27, \i, \l
mov x4, v28.D[\i - 1]
mov x5, v29.D[\i - 1]
cmp x4, #0
b.eq 1f
smull\i v7.4S, v28\l, z4
add v19.4S, v19.4S, v7.4S
sub v20.4S, v20.4S, v7.4S
sub v21.4S, v21.4S, v7.4S
add v22.4S, v22.4S, v7.4S
1: mov x4, v30.D[\i - 1]
cmp x5, #0
b.eq 2f
smlal\i v17.4S, v29\l, z5
smlsl\i v18.4S, v29\l, z1
smlal\i v5.4S, v29\l, z7
smlal\i v6.4S, v29\l, z3
2: mov x5, v31.D[\i - 1]
cmp x4, #0
b.eq 3f
smull\i v7.4S, v30\l, z6
smull\i v16.4S, v30\l, z2
add v19.4S, v19.4S, v7.4S
sub v22.4S, v22.4S, v7.4S
sub v20.4S, v20.4S, v16.4S
add v21.4S, v21.4S, v16.4S
3: cmp x5, #0
b.eq 4f
smlal\i v17.4S, v31\l, z7
smlsl\i v18.4S, v31\l, z5
smlal\i v5.4S, v31\l, z3
smlsl\i v6.4S, v31\l, z1
4: addhn v7.4H, v19.4S, v17.4S
addhn2 v7.8H, v20.4S, v18.4S
subhn v18.4H, v20.4S, v18.4S
subhn2 v18.8H, v19.4S, v17.4S
addhn v16.4H, v21.4S, v5.4S
addhn2 v16.8H, v22.4S, v6.4S
subhn v17.4H, v22.4S, v6.4S
subhn2 v17.8H, v21.4S, v5.4S
ret
endfunc
.endm
declare_idct_col4_neon 1, .4H
declare_idct_col4_neon 2, .8H
function ff_simple_idct_put_neon, export=1
idct_start x2
idct_row4_neon v24, v25, v26, v27, 1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1
sqshrun v1.8B, v7.8H, #COL_SHIFT-16
sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16
sqshrun v3.8B, v17.8H, #COL_SHIFT-16
sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16
bl idct_col4_neon2
sqshrun v2.8B, v7.8H, #COL_SHIFT-16
sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16
sqshrun v4.8B, v17.8H, #COL_SHIFT-16
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16
zip1 v16.4S, v1.4S, v2.4S
zip2 v17.4S, v1.4S, v2.4S
st1 {v16.D}[0], [x0], x1
st1 {v16.D}[1], [x0], x1
zip1 v18.4S, v3.4S, v4.4S
zip2 v19.4S, v3.4S, v4.4S
st1 {v17.D}[0], [x0], x1
st1 {v17.D}[1], [x0], x1
st1 {v18.D}[0], [x0], x1
st1 {v18.D}[1], [x0], x1
st1 {v19.D}[0], [x0], x1
st1 {v19.D}[1], [x0], x1
idct_end
endfunc
function ff_simple_idct_add_neon, export=1
idct_start x2
idct_row4_neon v24, v25, v26, v27, 1
idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1
sshr v1.8H, v7.8H, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16
bl idct_col4_neon2
sshr v7.8H, v7.8H, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16
mov x9, x0
ld1 {v19.D}[0], [x0], x1
zip1 v23.2D, v1.2D, v7.2D
zip2 v24.2D, v1.2D, v7.2D
ld1 {v19.D}[1], [x0], x1
zip1 v25.2D, v2.2D, v16.2D
zip2 v26.2D, v2.2D, v16.2D
ld1 {v20.D}[0], [x0], x1
zip1 v27.2D, v3.2D, v17.2D
zip2 v28.2D, v3.2D, v17.2D
ld1 {v20.D}[1], [x0], x1
zip1 v29.2D, v4.2D, v18.2D
zip2 v30.2D, v4.2D, v18.2D
ld1 {v21.D}[0], [x0], x1
uaddw v23.8H, v23.8H, v19.8B
uaddw2 v24.8H, v24.8H, v19.16B
ld1 {v21.D}[1], [x0], x1
sqxtun v23.8B, v23.8H
sqxtun2 v23.16B, v24.8H
ld1 {v22.D}[0], [x0], x1
uaddw v24.8H, v25.8H, v20.8B
uaddw2 v25.8H, v26.8H, v20.16B
ld1 {v22.D}[1], [x0], x1
sqxtun v24.8B, v24.8H
sqxtun2 v24.16B, v25.8H
st1 {v23.D}[0], [x9], x1
uaddw v25.8H, v27.8H, v21.8B
uaddw2 v26.8H, v28.8H, v21.16B
st1 {v23.D}[1], [x9], x1
sqxtun v25.8B, v25.8H
sqxtun2 v25.16B, v26.8H
st1 {v24.D}[0], [x9], x1
uaddw v26.8H, v29.8H, v22.8B
uaddw2 v27.8H, v30.8H, v22.16B
st1 {v24.D}[1], [x9], x1
sqxtun v26.8B, v26.8H
sqxtun2 v26.16B, v27.8H
st1 {v25.D}[0], [x9], x1
st1 {v25.D}[1], [x9], x1
st1 {v26.D}[0], [x9], x1
st1 {v26.D}[1], [x9], x1
idct_end
endfunc
function ff_simple_idct_neon, export=1
idct_start x0
mov x2, x0
idct_row4_neon v24, v25, v26, v27, 1
idct_row4_neon v28, v29, v30, v31, 2
sub x2, x2, #128
bl idct_col4_neon1
sshr v1.8H, v7.8H, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16
bl idct_col4_neon2
sshr v7.8H, v7.8H, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16
zip1 v23.2D, v1.2D, v7.2D
zip2 v24.2D, v1.2D, v7.2D
st1 {v23.2D,v24.2D}, [x2], #32
zip1 v25.2D, v2.2D, v16.2D
zip2 v26.2D, v2.2D, v16.2D
st1 {v25.2D,v26.2D}, [x2], #32
zip1 v27.2D, v3.2D, v17.2D
zip2 v28.2D, v3.2D, v17.2D
st1 {v27.2D,v28.2D}, [x2], #32
zip1 v29.2D, v4.2D, v18.2D
zip2 v30.2D, v4.2D, v18.2D
st1 {v29.2D,v30.2D}, [x2], #32
idct_end
endfunc

@@ -0,0 +1,47 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "config.h"
void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
}
}

@@ -0,0 +1,28 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
function ff_prefetch_aarch64, export=1
subs w2, w2, #2
prfm pldl1strm, [x0]
prfm pldl1strm, [x0, x1]
add x0, x0, x1, lsl #1
b.gt X(ff_prefetch_aarch64)
ret
endfunc

@@ -0,0 +1,32 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/videodsp.h"
void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);
av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
{
int cpu_flags = av_get_cpu_flags();
if (have_armv8(cpu_flags))
ctx->prefetch = ff_prefetch_aarch64;
}

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
#define AVCODEC_AARCH64_VP9DSP_INIT_H
#include "libavcodec/vp9dsp.h"
void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);
#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPP 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define BPP 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"

@@ -0,0 +1,273 @@
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
#include "vp9dsp_init.h"
#define declare_fpel(type, sz, suffix) \
void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define decl_mc_func(op, filter, dir, sz, bpp) \
void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define define_8tap_2d_fn(op, filter, sz, bpp) \
static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, \
ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]); \
/* We only need h + 7 lines, but the horizontal filter assumes an \
* even number of rows, so filter h + 8 lines here. */ \
ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz, \
src - 3 * src_stride, src_stride, \
h + 8, mx, 0); \
ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride, \
temp + 3 * 2 * sz, 2 * sz, \
h, 0, my); \
}
#define decl_filter_funcs(op, dir, sz, bpp) \
decl_mc_func(op, regular, dir, sz, bpp); \
decl_mc_func(op, sharp, dir, sz, bpp); \
decl_mc_func(op, smooth, dir, sz, bpp)
#define decl_mc_funcs(sz, bpp) \
decl_filter_funcs(put, h, sz, bpp); \
decl_filter_funcs(avg, h, sz, bpp); \
decl_filter_funcs(put, v, sz, bpp); \
decl_filter_funcs(avg, v, sz, bpp); \
decl_filter_funcs(put, hv, sz, bpp); \
decl_filter_funcs(avg, hv, sz, bpp)
#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64
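/* The widest copies have plain AArch64 (non-NEON) implementations, hence
 * the _aarch64 aliases above and their registration under have_armv8()
 * rather than have_neon() in the init below. */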
declare_fpel(copy, 128, );
declare_fpel(copy, 64, );
declare_fpel(copy, 32, );
declare_fpel(copy, 16, );
declare_fpel(copy, 8, );
declare_fpel(avg, 64, _16);
declare_fpel(avg, 32, _16);
declare_fpel(avg, 16, _16);
declare_fpel(avg, 8, _16);
declare_fpel(avg, 4, _16);
decl_mc_funcs(64, BPP);
decl_mc_funcs(32, BPP);
decl_mc_funcs(16, BPP);
decl_mc_funcs(8, BPP);
decl_mc_funcs(4, BPP);
#define define_8tap_2d_funcs(sz, bpp) \
define_8tap_2d_fn(put, regular, sz, bpp) \
define_8tap_2d_fn(put, sharp, sz, bpp) \
define_8tap_2d_fn(put, smooth, sz, bpp) \
define_8tap_2d_fn(avg, regular, sz, bpp) \
define_8tap_2d_fn(avg, sharp, sz, bpp) \
define_8tap_2d_fn(avg, smooth, sz, bpp)
define_8tap_2d_funcs(64, BPP)
define_8tap_2d_funcs(32, BPP)
define_8tap_2d_funcs(16, BPP)
define_8tap_2d_funcs(8, BPP)
define_8tap_2d_funcs(4, BPP)
static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#define init_fpel(idx1, idx2, sz, type, suffix) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
#define init_copy(idx, sz, suffix) \
init_fpel(idx, 0, sz, copy, suffix)
#define init_avg(idx, sz, suffix) \
init_fpel(idx, 1, sz, avg, suffix)
#define init_copy_avg(idx, sz1, sz2) \
init_copy(idx, sz2, _neon); \
init_avg (idx, sz1, _16_neon)
if (have_armv8(cpu_flags)) {
init_copy(0, 128, _aarch64);
init_copy(1, 64, _aarch64);
init_copy(2, 32, _aarch64);
}
if (have_neon(cpu_flags)) {
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon
#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp) \
init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx, bpp); \
init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx, bpp)
#define init_mc_funcs_dirs(idx, sz, bpp) \
init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_, bpp); \
init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_, bpp); \
init_mc_funcs(idx, hv, 1, 1, sz, , bpp)
init_avg(0, 64, _16_neon);
init_avg(1, 32, _16_neon);
init_avg(2, 16, _16_neon);
init_copy_avg(3, 8, 16);
init_copy_avg(4, 4, 8);
init_mc_funcs_dirs(0, 64, BPP);
init_mc_funcs_dirs(1, 32, BPP);
init_mc_funcs_dirs(2, 16, BPP);
init_mc_funcs_dirs(3, 8, BPP);
init_mc_funcs_dirs(4, 4, BPP);
}
}
#define define_itxfm2(type_a, type_b, sz, bpp) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
ptrdiff_t stride, \
int16_t *_block, int eob)
#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)
#define define_itxfm_funcs(sz, bpp) \
define_itxfm(idct, idct, sz, bpp); \
define_itxfm(iadst, idct, sz, bpp); \
define_itxfm(idct, iadst, sz, bpp); \
define_itxfm(iadst, iadst, sz, bpp)
define_itxfm_funcs(4, BPP);
define_itxfm_funcs(8, BPP);
define_itxfm_funcs(16, BPP);
define_itxfm(idct, idct, 32, BPP);
define_itxfm(iwht, iwht, 4, BPP);
static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
#define init_itxfm2(tx, sz, bpp) \
dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_##bpp##_neon; \
dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)
#define init_idct2(tx, nm, bpp) \
dsp->itxfm_add[tx][DCT_DCT] = \
dsp->itxfm_add[tx][ADST_DCT] = \
dsp->itxfm_add[tx][DCT_ADST] = \
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)
init_itxfm(TX_4X4, 4x4, BPP);
init_itxfm(TX_8X8, 8x8, BPP);
init_itxfm(TX_16X16, 16x16, BPP);
init_idct(TX_32X32, idct_idct_32x32, BPP);
init_idct(4, iwht_iwht_4x4, BPP);
}
}
#define define_loop_filter(dir, wd, size, bpp) \
void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
#define define_loop_filters(wd, size, bpp) \
define_loop_filter(h, wd, size, bpp); \
define_loop_filter(v, wd, size, bpp)
define_loop_filters(4, 8, BPP);
define_loop_filters(8, 8, BPP);
define_loop_filters(16, 8, BPP);
define_loop_filters(16, 16, BPP);
define_loop_filters(44, 16, BPP);
define_loop_filters(48, 16, BPP);
define_loop_filters(84, 16, BPP);
define_loop_filters(88, 16, BPP);
static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon
#define init_lpf_func_16(idx, dir, bpp) \
dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon
#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon
#define init_lpf_funcs_8_wd(idx, wd, bpp) \
init_lpf_func_8(idx, 0, h, wd, bpp); \
init_lpf_func_8(idx, 1, v, wd, bpp)
#define init_lpf_funcs_16(bpp) \
init_lpf_func_16(0, h, bpp); \
init_lpf_func_16(1, v, bpp)
#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp); \
init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)
#define init_lpf_funcs_8(bpp) \
init_lpf_funcs_8_wd(0, 4, bpp); \
init_lpf_funcs_8_wd(1, 8, bpp); \
init_lpf_funcs_8_wd(2, 16, bpp)
#define init_lpf_funcs_mix2(bpp) \
init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
init_lpf_funcs_mix2_wd(1, 1, 88, bpp)
init_lpf_funcs_8(BPP);
init_lpf_funcs_16(BPP);
init_lpf_funcs_mix2(BPP);
}
}
av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
vp9dsp_mc_init_aarch64(dsp);
vp9dsp_loopfilter_init_aarch64(dsp);
vp9dsp_itxfm_init_aarch64(dsp);
}

@@ -0,0 +1,258 @@
/*
* Copyright (c) 2016 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "vp9dsp_init.h"
#define declare_fpel(type, sz) \
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define declare_copy_avg(sz) \
declare_fpel(copy, sz); \
declare_fpel(avg , sz)
#define decl_mc_func(op, filter, dir, sz) \
void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
#define define_8tap_2d_fn(op, filter, sz) \
static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]); \
/* We only need h + 7 lines, but the horizontal filter assumes an \
* even number of rows, so filter h + 8 lines here. */ \
ff_vp9_put_##filter##sz##_h_neon(temp, sz, \
src - 3 * src_stride, src_stride, \
h + 8, mx, 0); \
ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride, \
temp + 3 * sz, sz, \
h, 0, my); \
}
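/* For example, define_8tap_2d_fn(put, regular, 16) defines
 * put_regular16_hv_neon(), which first filters horizontally into a temporary
 * buffer (starting 3 rows above the source, to provide context for the
 * vertical filter) and then filters that buffer vertically into dst. */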
#define decl_filter_funcs(op, dir, sz) \
decl_mc_func(op, regular, dir, sz); \
decl_mc_func(op, sharp, dir, sz); \
decl_mc_func(op, smooth, dir, sz)
#define decl_mc_funcs(sz) \
decl_filter_funcs(put, h, sz); \
decl_filter_funcs(avg, h, sz); \
decl_filter_funcs(put, v, sz); \
decl_filter_funcs(avg, v, sz); \
decl_filter_funcs(put, hv, sz); \
decl_filter_funcs(avg, hv, sz)
#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
declare_copy_avg(64);
declare_copy_avg(32);
declare_copy_avg(16);
declare_copy_avg(8);
declare_copy_avg(4);
decl_mc_funcs(64);
decl_mc_funcs(32);
decl_mc_funcs(16);
decl_mc_funcs(8);
decl_mc_funcs(4);
#define define_8tap_2d_funcs(sz) \
define_8tap_2d_fn(put, regular, sz) \
define_8tap_2d_fn(put, sharp, sz) \
define_8tap_2d_fn(put, smooth, sz) \
define_8tap_2d_fn(avg, regular, sz) \
define_8tap_2d_fn(avg, sharp, sz) \
define_8tap_2d_fn(avg, smooth, sz)
define_8tap_2d_funcs(64)
define_8tap_2d_funcs(32)
define_8tap_2d_funcs(16)
define_8tap_2d_funcs(8)
define_8tap_2d_funcs(4)
static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
#define init_fpel(idx1, idx2, sz, type, suffix) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##suffix
#define init_copy(idx, sz, suffix) \
init_fpel(idx, 0, sz, copy, suffix)
#define init_avg(idx, sz, suffix) \
init_fpel(idx, 1, sz, avg, suffix)
#define init_copy_avg(idx, sz) \
init_copy(idx, sz, _neon); \
init_avg (idx, sz, _neon)
if (have_armv8(cpu_flags)) {
init_copy(0, 64, _aarch64);
init_copy(1, 32, _aarch64);
}
if (have_neon(cpu_flags)) {
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
init_mc_func(idx, 0, put, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx); \
init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP, sharp, dir, mx, my, sz, pfx); \
init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH, smooth, dir, mx, my, sz, pfx)
#define init_mc_funcs_dirs(idx, sz) \
init_mc_funcs(idx, h, 1, 0, sz, ff_vp9_); \
init_mc_funcs(idx, v, 0, 1, sz, ff_vp9_); \
init_mc_funcs(idx, hv, 1, 1, sz,)
init_avg(0, 64, _neon);
init_avg(1, 32, _neon);
init_copy_avg(2, 16);
init_copy_avg(3, 8);
init_copy_avg(4, 4);
init_mc_funcs_dirs(0, 64);
init_mc_funcs_dirs(1, 32);
init_mc_funcs_dirs(2, 16);
init_mc_funcs_dirs(3, 8);
init_mc_funcs_dirs(4, 4);
}
}
#define define_itxfm(type_a, type_b, sz) \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
ptrdiff_t stride, \
int16_t *_block, int eob)
#define define_itxfm_funcs(sz) \
define_itxfm(idct, idct, sz); \
define_itxfm(iadst, idct, sz); \
define_itxfm(idct, iadst, sz); \
define_itxfm(iadst, iadst, sz)
define_itxfm_funcs(4);
define_itxfm_funcs(8);
define_itxfm_funcs(16);
define_itxfm(idct, idct, 32);
define_itxfm(iwht, iwht, 4);
static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
#define init_itxfm(tx, sz) \
dsp->itxfm_add[tx][DCT_DCT] = ff_vp9_idct_idct_##sz##_add_neon; \
dsp->itxfm_add[tx][DCT_ADST] = ff_vp9_iadst_idct_##sz##_add_neon; \
dsp->itxfm_add[tx][ADST_DCT] = ff_vp9_idct_iadst_##sz##_add_neon; \
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
#define init_idct(tx, nm) \
dsp->itxfm_add[tx][DCT_DCT] = \
dsp->itxfm_add[tx][ADST_DCT] = \
dsp->itxfm_add[tx][DCT_ADST] = \
dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
init_itxfm(TX_4X4, 4x4);
init_itxfm(TX_8X8, 8x8);
init_itxfm(TX_16X16, 16x16);
init_idct(TX_32X32, idct_idct_32x32);
init_idct(4 /* lossless */, iwht_iwht_4x4);
}
}
#define define_loop_filter(dir, wd, len) \
void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)
#define define_loop_filters(wd, len) \
define_loop_filter(h, wd, len); \
define_loop_filter(v, wd, len)
define_loop_filters(4, 8);
define_loop_filters(8, 8);
define_loop_filters(16, 8);
define_loop_filters(16, 16);
define_loop_filters(44, 16);
define_loop_filters(48, 16);
define_loop_filters(84, 16);
define_loop_filters(88, 16);
static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
}
}
av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
{
if (bpp == 10) {
ff_vp9dsp_init_10bpp_aarch64(dsp);
return;
} else if (bpp == 12) {
ff_vp9dsp_init_12bpp_aarch64(dsp);
return;
} else if (bpp != 8)
return;
vp9dsp_mc_init_aarch64(dsp);
vp9dsp_loopfilter_init_aarch64(dsp);
vp9dsp_itxfm_init_aarch64(dsp);
}
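/* For reference, a minimal sketch of how this entry point is reached from
 * the generic code; the call site lives in libavcodec/vp9dsp.c and is an
 * assumption here, not part of this patch:
 *
 *   av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
 *   {
 *       ...
 *       if (ARCH_AARCH64)
 *           ff_vp9dsp_init_aarch64(dsp, bpp);
 *   }
 */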

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,873 @@
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm
// The input to and output from this macro are in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
dup v0.8h, w2 // E
dup v2.8h, w3 // I
dup v3.8h, w4 // H
uabd v4.8h, v20.8h, v21.8h // abs(p3 - p2)
uabd v5.8h, v21.8h, v22.8h // abs(p2 - p1)
uabd v6.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd v7.8h, v24.8h, v25.8h // abs(q0 - q1)
uabd \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
uabd \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
umax v4.8h, v4.8h, v5.8h
umax v5.8h, v6.8h, v7.8h
umax \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
uabd v6.8h, v23.8h, v24.8h // abs(p0 - q0)
umax v4.8h, v4.8h, v5.8h
add v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
uabd v5.8h, v22.8h, v25.8h // abs(p1 - q1)
umax v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
ushr v5.8h, v5.8h, #1
cmhs v4.8h, v2.8h, v4.8h // max(abs()) <= I
add v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
cmhs v6.8h, v0.8h, v6.8h
and v4.16b, v4.16b, v6.16b // fm
// If no pixels need filtering, just exit as soon as possible
mov x11, v4.d[0]
mov x12, v4.d[1]
adds x11, x11, x12
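// (Each lane of the fm mask is either all zeroes or all ones, so the sum
// of the two halves is zero if and only if no lane is set.)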
b.ne 1f
br x10
1:
.if \wd >= 8
dup v0.8h, w5
uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0)
uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0)
uabd v1.8h, v22.8h, v23.8h // abs(p1 - p0)
uabd \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
uabd \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
uabd \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
umax v6.8h, v6.8h, v2.8h
umax v1.8h, v1.8h, \tmp1\().8h
umax \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
.if \wd == 16
uabd v7.8h, v16.8h, v23.8h // abs(p7 - p0)
umax v6.8h, v6.8h, v1.8h
uabd v2.8h, v17.8h, v23.8h // abs(p6 - p0)
umax v6.8h, v6.8h, \tmp2\().8h
uabd v1.8h, v18.8h, v23.8h // abs(p5 - p0)
cmhs v6.8h, v0.8h, v6.8h // flat8in
uabd v8.8h, v19.8h, v23.8h // abs(p4 - p0)
and v6.16b, v6.16b, v4.16b // flat8in && fm
uabd v9.8h, v28.8h, v24.8h // abs(q4 - q0)
bic v4.16b, v4.16b, v6.16b // fm && !flat8in
uabd v10.8h, v29.8h, v24.8h // abs(q5 - q0)
uabd v11.8h, v30.8h, v24.8h // abs(q6 - q0)
uabd v12.8h, v31.8h, v24.8h // abs(q7 - q0)
umax v7.8h, v7.8h, v2.8h
umax v1.8h, v1.8h, v8.8h
umax v9.8h, v9.8h, v10.8h
umax v11.8h, v11.8h, v12.8h
// The rest of the calculation of flat8out is interleaved below
.else
// The rest of the calculation of flat8in is interleaved below
.endif
.endif
// Calculate the normal inner loop filter for 2 or 4 pixels
uabd v5.8h, v22.8h, v23.8h // abs(p1 - p0)
.if \wd == 16
umax v7.8h, v7.8h, v1.8h
umax v9.8h, v9.8h, v11.8h
.elseif \wd == 8
umax v6.8h, v6.8h, v1.8h
.endif
uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0)
.if \wd == 16
umax v7.8h, v7.8h, v9.8h
.elseif \wd == 8
umax v6.8h, v6.8h, \tmp2\().8h
.endif
dup \tmp2\().8h, w6 // left shift for saturation
sub \tmp1\().8h, v22.8h, v25.8h // p1 - q1
neg \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
umax v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
sub \tmp3\().8h, v24.8h, v23.8h // q0 - p0
movi \tmp5\().8h, #3
.if \wd == 8
cmhs v6.8h, v0.8h, v6.8h // flat8in
.endif
cmhs v5.8h, v3.8h, v5.8h // !hev
.if \wd == 8
and v6.16b, v6.16b, v4.16b // flat8in && fm
.endif
sqshl \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
.if \wd == 16
cmhs v7.8h, v0.8h, v7.8h // flat8out
.elseif \wd == 8
bic v4.16b, v4.16b, v6.16b // fm && !flat8in
.endif
and v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
.if \wd == 16
and v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
.endif
sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)
mul \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
bic \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
movi v2.8h, #4
add \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
movi v3.8h, #3
sqshl \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
movi \tmp5\().8h, #0
sshl \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
dup \tmp6\().8h, w7 // max pixel value
.if \wd == 16
bic v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
.endif
ushr \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1
add \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
add \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
smin \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
smin \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
sshr \tmp3\().8h, \tmp3\().8h, #3 // f1
sshr \tmp4\().8h, \tmp4\().8h, #3 // f2
add v0.8h, v23.8h, \tmp4\().8h // p0 + f2
sub v2.8h, v24.8h, \tmp3\().8h // q0 - f1
smin v0.8h, v0.8h, \tmp6\().8h
smin v2.8h, v2.8h, \tmp6\().8h
srshr \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
smax v0.8h, v0.8h, \tmp5\().8h // out p0
smax v2.8h, v2.8h, \tmp5\().8h // out q0
bit v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
bit v24.16b, v2.16b, v4.16b
add v0.8h, v22.8h, \tmp3\().8h // p1 + f
sub v2.8h, v25.8h, \tmp3\().8h // q1 - f
.if \wd >= 8
mov x11, v6.d[0]
.endif
smin v0.8h, v0.8h, \tmp6\().8h
smin v2.8h, v2.8h, \tmp6\().8h
.if \wd >= 8
mov x12, v6.d[1]
.endif
smax v0.8h, v0.8h, \tmp5\().8h // out p1
smax v2.8h, v2.8h, \tmp5\().8h // out q1
.if \wd >= 8
adds x11, x11, x12
.endif
bit v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
bit v25.16b, v2.16b, v5.16b
// If no pixels need flat8in, jump to flat8out
// (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
b.eq 6f
.else
b.ne 1f
br x13
1:
.endif
// flat8in
add \tmp1\().8h, v20.8h, v21.8h
add \tmp3\().8h, v22.8h, v25.8h
add \tmp5\().8h, v20.8h, v22.8h
add \tmp7\().8h, v23.8h, v26.8h
add v0.8h, \tmp1\().8h, \tmp1\().8h
add v0.8h, v0.8h, v23.8h
add v0.8h, v0.8h, v24.8h
add v0.8h, v0.8h, \tmp5\().8h
sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
urshr v2.8h, v0.8h, #3 // out p2
add v0.8h, v0.8h, \tmp3\().8h
add \tmp1\().8h, v20.8h, v23.8h
add \tmp3\().8h, v24.8h, v27.8h
urshr v3.8h, v0.8h, #3 // out p1
add v0.8h, v0.8h, \tmp7\().8h
sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
add \tmp5\().8h, v21.8h, v24.8h
add \tmp7\().8h, v25.8h, v27.8h
urshr v4.8h, v0.8h, #3 // out p0
add v0.8h, v0.8h, \tmp3\().8h
sub \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
add \tmp1\().8h, v22.8h, v25.8h
add \tmp3\().8h, v26.8h, v27.8h
urshr v5.8h, v0.8h, #3 // out q0
add v0.8h, v0.8h, \tmp7\().8h
sub \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
urshr \tmp5\().8h, v0.8h, #3 // out q1
add v0.8h, v0.8h, \tmp3\().8h
// The output here is written back into the input registers. This doesn't
// matter for the flat8out part below, since we only update those pixels
// which won't be touched below.
bit v21.16b, v2.16b, v6.16b
bit v22.16b, v3.16b, v6.16b
bit v23.16b, v4.16b, v6.16b
urshr \tmp6\().8h, v0.8h, #3 // out q2
bit v24.16b, v5.16b, v6.16b
bit v25.16b, \tmp5\().16b, v6.16b
bit v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
orr v2.16b, v6.16b, v7.16b
mov x11, v2.d[0]
mov x12, v2.d[1]
adds x11, x11, x12
b.ne 1f
// If no pixels needed flat8in nor flat8out, jump to a
// writeout of the inner 4 pixels
br x14
1:
mov x11, v7.d[0]
mov x12, v7.d[1]
adds x11, x11, x12
b.ne 1f
// If no pixels need flat8out, jump to a writeout of the inner 6 pixels
br x15
1:
// flat8out
// This writes all outputs into v2-v17 (skipping v7 and v16).
// If this part is skipped, the output is read from v21-v26 (which is the input
// to this section).
shl v0.8h, v16.8h, #3 // 8 * v16
sub v0.8h, v0.8h, v16.8h // 7 * v16
add v0.8h, v0.8h, v17.8h
add v8.8h, v17.8h, v18.8h
add v10.8h, v19.8h, v20.8h
add v0.8h, v0.8h, v8.8h
add v8.8h, v16.8h, v17.8h
add v12.8h, v21.8h, v22.8h
add v0.8h, v0.8h, v10.8h
add v10.8h, v18.8h, v25.8h
add v14.8h, v23.8h, v24.8h
sub v10.8h, v10.8h, v8.8h
add v0.8h, v0.8h, v12.8h
add v0.8h, v0.8h, v14.8h
add v12.8h, v16.8h, v18.8h
add v14.8h, v19.8h, v26.8h
urshr v2.8h, v0.8h, #4
add v0.8h, v0.8h, v10.8h
add v8.8h, v16.8h, v19.8h
add v10.8h, v20.8h, v27.8h
sub v14.8h, v14.8h, v12.8h
bif v2.16b, v17.16b, v7.16b
urshr v3.8h, v0.8h, #4
add v0.8h, v0.8h, v14.8h
add v12.8h, v16.8h, v20.8h
add v14.8h, v21.8h, v28.8h
sub v10.8h, v10.8h, v8.8h
bif v3.16b, v18.16b, v7.16b
urshr v4.8h, v0.8h, #4
add v0.8h, v0.8h, v10.8h
add v8.8h, v16.8h, v21.8h
add v10.8h, v22.8h, v29.8h
sub v14.8h, v14.8h, v12.8h
bif v4.16b, v19.16b, v7.16b
urshr v5.8h, v0.8h, #4
add v0.8h, v0.8h, v14.8h
add v12.8h, v16.8h, v22.8h
add v14.8h, v23.8h, v30.8h
sub v10.8h, v10.8h, v8.8h
bif v5.16b, v20.16b, v7.16b
urshr v6.8h, v0.8h, #4
add v0.8h, v0.8h, v10.8h
add v10.8h, v16.8h, v23.8h
sub v14.8h, v14.8h, v12.8h
add v12.8h, v24.8h, v31.8h
bif v6.16b, v21.16b, v7.16b
urshr v8.8h, v0.8h, #4
add v0.8h, v0.8h, v14.8h
sub v10.8h, v12.8h, v10.8h
add v12.8h, v17.8h, v24.8h
add v14.8h, v25.8h, v31.8h
bif v8.16b, v22.16b, v7.16b
urshr v9.8h, v0.8h, #4
add v0.8h, v0.8h, v10.8h
sub v14.8h, v14.8h, v12.8h
add v12.8h, v26.8h, v31.8h
bif v9.16b, v23.16b, v7.16b
urshr v10.8h, v0.8h, #4
add v0.8h, v0.8h, v14.8h
add v14.8h, v18.8h, v25.8h
add v18.8h, v19.8h, v26.8h
sub v12.8h, v12.8h, v14.8h
add v14.8h, v27.8h, v31.8h
bif v10.16b, v24.16b, v7.16b
urshr v11.8h, v0.8h, #4
add v0.8h, v0.8h, v12.8h
add v12.8h, v20.8h, v27.8h
sub v14.8h, v14.8h, v18.8h
add v18.8h, v28.8h, v31.8h
bif v11.16b, v25.16b, v7.16b
sub v18.8h, v18.8h, v12.8h
urshr v12.8h, v0.8h, #4
add v0.8h, v0.8h, v14.8h
add v14.8h, v21.8h, v28.8h
add v20.8h, v29.8h, v31.8h
bif v12.16b, v26.16b, v7.16b
urshr v13.8h, v0.8h, #4
add v0.8h, v0.8h, v18.8h
sub v20.8h, v20.8h, v14.8h
add v18.8h, v22.8h, v29.8h
add v22.8h, v30.8h, v31.8h
bif v13.16b, v27.16b, v7.16b
urshr v14.8h, v0.8h, #4
add v0.8h, v0.8h, v20.8h
sub v22.8h, v22.8h, v18.8h
bif v14.16b, v28.16b, v7.16b
urshr v15.8h, v0.8h, #4
add v0.8h, v0.8h, v22.8h
bif v15.16b, v29.16b, v7.16b
urshr v17.8h, v0.8h, #4
bif v17.16b, v30.16b, v7.16b
.endif
.endm
// For wd <= 8, we use v16-v19 and v28-v31 as temp registers; wd=16 needs
// those for inputs/outputs, so it uses v8-v15 as temp registers instead.
function vp9_loop_filter_4
loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
ret
endfunc
function vp9_loop_filter_8
loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
ret
endfunc
function vp9_loop_filter_16
loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
ret
endfunc
.macro loop_filter_4
bl vp9_loop_filter_4
.endm
.macro loop_filter_8
// calculate alternative 'return' targets
adr x13, 6f
bl vp9_loop_filter_8
.endm
.macro loop_filter_16
// calculate alternative 'return' targets
adr x14, 7f
adr x15, 8f
bl vp9_loop_filter_16
.endm
// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
mov x16, x30
stp d14, d15, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
.endif
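// Scale the E, I and H thresholds from 8 bit to the target bit depth,
// and set up x5 = flat8 threshold, x6 = shift amount used for emulating
// saturation to BIT_DEPTH - 1 bits and x7 = the maximum pixel value.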
lsl w2, w2, #\bpp - 8
lsl w3, w3, #\bpp - 8
lsl w4, w4, #\bpp - 8
mov x5, #1 << (\bpp - 8)
mov x6, #16 - \bpp
mov x7, #((1 << \bpp) - 1)
.if \push
bl \func\()_16_neon
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
br x16
.else
b \func\()_16_neon
.endif
endfunc
.endm
.macro bpp_frontends func, push=0
bpp_frontend \func, 10, \push
bpp_frontend \func, 12, \push
.endm
.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
mov x16, x30
.if \push
stp d14, d15, [sp, #-0x10]!
stp d12, d13, [sp, #-0x10]!
stp d10, d11, [sp, #-0x10]!
stp d8, d9, [sp, #-0x10]!
.endif
lsl w2, w2, #\bpp - 8
lsl w3, w3, #\bpp - 8
lsl w4, w4, #\bpp - 8
mov x5, #1 << (\bpp - 8)
mov x6, #16 - \bpp
mov x7, #((1 << \bpp) - 1)
bl \func\()_\int_suffix\()_16_neon
.ifc \dir,h
add x0, x0, x1, lsl #3
.else
add x0, x0, #16
.endif
bl \func\()_\int_suffix\()_16_neon
.if \push
ldp d8, d9, [sp], 0x10
ldp d10, d11, [sp], 0x10
ldp d12, d13, [sp], 0x10
ldp d14, d15, [sp], 0x10
.endif
br x16
endfunc
.endm
.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm
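// The mix2 versions filter two adjacent 8 pixel blocks with (potentially)
// different filter widths; the E, I and H values for the second block are
// packed into the second byte of each parameter.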
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
mov x16, x30
lsr w8, w2, #8
lsr w14, w3, #8
lsr w15, w4, #8
and w2, w2, #0xff
and w3, w3, #0xff
and w4, w4, #0xff
lsl w2, w2, #\bpp - 8
lsl w3, w3, #\bpp - 8
lsl w4, w4, #\bpp - 8
mov x5, #1 << (\bpp - 8)
mov x6, #16 - \bpp
mov x7, #((1 << \bpp) - 1)
bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
add x0, x0, x1, lsl #3
.else
add x0, x0, #16
.endif
lsl w2, w8, #\bpp - 8
lsl w3, w14, #\bpp - 8
lsl w4, w15, #\bpp - 8
bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
br x16
endfunc
.endm
.macro bpp_frontends_mix2 wd1, wd2
bpp_frontend_mix2 \wd1, \wd2, v, 10
bpp_frontend_mix2 \wd1, \wd2, v, 12
bpp_frontend_mix2 \wd1, \wd2, h, 10
bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm
function vp9_loop_filter_v_4_8_16_neon
mov x10, x30
sub x9, x0, x1, lsl #2
ld1 {v20.8h}, [x9], x1 // p3
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v21.8h}, [x9], x1 // p2
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v22.8h}, [x9], x1 // p1
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v23.8h}, [x9], x1 // p0
ld1 {v27.8h}, [x0], x1 // q3
sub x0, x0, x1, lsl #2
sub x9, x9, x1, lsl #1
loop_filter_4
st1 {v22.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
sub x0, x0, x1, lsl #1
br x10
endfunc
bpp_frontends vp9_loop_filter_v_4_8
function vp9_loop_filter_h_4_8_16_neon
mov x10, x30
sub x9, x0, #8
add x0, x9, x1, lsl #2
ld1 {v20.8h}, [x9], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x9], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x9], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x9], x1
ld1 {v27.8h}, [x0], x1
sub x9, x9, x1, lsl #2
sub x0, x0, x1, lsl #3
add x0, x0, #8
transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
loop_filter_4
// Move x9 forward by 2 pixels; we don't need to rewrite the
// outermost 2 pixels since they aren't changed.
add x9, x9, #4
add x0, x9, x1, lsl #2
// We only will write the mid 4 pixels back; after the loop filter,
// these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
// We need to transpose them to columns, done with a 4x8 transpose
// (which in practice is two 4x4 transposes of the two 4x4 halves
// of the 8x4 pixels; into 4x8 pixels).
transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
st1 {v22.d}[0], [x9], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x9], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x9], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x9], x1
st1 {v25.d}[1], [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #4
br x10
endfunc
bpp_frontends vp9_loop_filter_h_4_8
function vp9_loop_filter_v_8_8_16_neon
mov x10, x30
sub x9, x0, x1, lsl #2
ld1 {v20.8h}, [x9], x1 // p3
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v21.8h}, [x9], x1 // p2
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v22.8h}, [x9], x1 // p1
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v23.8h}, [x9], x1 // p0
ld1 {v27.8h}, [x0], x1 // q3
sub x9, x9, x1, lsl #2
sub x0, x0, x1, lsl #2
add x9, x9, x1
loop_filter_8
st1 {v21.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v22.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v26.8h}, [x0], x1
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x10
6:
sub x9, x0, x1, lsl #1
st1 {v22.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
sub x0, x0, x1, lsl #1
br x10
endfunc
bpp_frontends vp9_loop_filter_v_8_8
function vp9_loop_filter_h_8_8_16_neon
mov x10, x30
sub x9, x0, #8
add x0, x9, x1, lsl #2
ld1 {v20.8h}, [x9], x1
ld1 {v24.8h}, [x0], x1
ld1 {v21.8h}, [x9], x1
ld1 {v25.8h}, [x0], x1
ld1 {v22.8h}, [x9], x1
ld1 {v26.8h}, [x0], x1
ld1 {v23.8h}, [x9], x1
ld1 {v27.8h}, [x0], x1
sub x9, x9, x1, lsl #2
sub x0, x0, x1, lsl #3
add x0, x0, #8
transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
loop_filter_8
add x0, x9, x1, lsl #2
// Even though only 6 pixels per row have been changed, we write the
// full 8 pixel registers.
transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
st1 {v20.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x9], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v27.8h}, [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #8
br x10
6:
// If we didn't need to do the flat8in part, we use the same writeback
// as in loop_filter_h_4_8.
add x9, x9, #4
add x0, x9, x1, lsl #2
transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
st1 {v22.d}[0], [x9], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x9], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x9], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x9], x1
st1 {v25.d}[1], [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #4
br x10
endfunc
bpp_frontends vp9_loop_filter_h_8_8
bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8
function vp9_loop_filter_v_16_8_16_neon
mov x10, x30
sub x9, x0, x1, lsl #3
ld1 {v16.8h}, [x9], x1 // p7
ld1 {v24.8h}, [x0], x1 // q0
ld1 {v17.8h}, [x9], x1 // p6
ld1 {v25.8h}, [x0], x1 // q1
ld1 {v18.8h}, [x9], x1 // p5
ld1 {v26.8h}, [x0], x1 // q2
ld1 {v19.8h}, [x9], x1 // p4
ld1 {v27.8h}, [x0], x1 // q3
ld1 {v20.8h}, [x9], x1 // p3
ld1 {v28.8h}, [x0], x1 // q4
ld1 {v21.8h}, [x9], x1 // p2
ld1 {v29.8h}, [x0], x1 // q5
ld1 {v22.8h}, [x9], x1 // p1
ld1 {v30.8h}, [x0], x1 // q6
ld1 {v23.8h}, [x9], x1 // p0
ld1 {v31.8h}, [x0], x1 // q7
sub x9, x9, x1, lsl #3
sub x0, x0, x1, lsl #3
add x9, x9, x1
loop_filter_16
// If we did the flat8out part, we get the output in
// v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
// store v2-v9 there, and v10-v17 into x0.
st1 {v2.8h}, [x9], x1
st1 {v10.8h}, [x0], x1
st1 {v3.8h}, [x9], x1
st1 {v11.8h}, [x0], x1
st1 {v4.8h}, [x9], x1
st1 {v12.8h}, [x0], x1
st1 {v5.8h}, [x9], x1
st1 {v13.8h}, [x0], x1
st1 {v6.8h}, [x9], x1
st1 {v14.8h}, [x0], x1
st1 {v8.8h}, [x9], x1
st1 {v15.8h}, [x0], x1
st1 {v9.8h}, [x9], x1
st1 {v17.8h}, [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, x1
br x10
8:
add x9, x9, x1, lsl #2
// If we didn't do the flat8out part, the output is left in the
// input registers.
st1 {v21.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v22.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v26.8h}, [x0], x1
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x10
7:
sub x9, x0, x1, lsl #1
st1 {v22.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
sub x0, x0, x1, lsl #1
br x10
endfunc
bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1
function vp9_loop_filter_h_16_8_16_neon
mov x10, x30
sub x9, x0, #16
ld1 {v16.8h}, [x9], x1
ld1 {v24.8h}, [x0], x1
ld1 {v17.8h}, [x9], x1
ld1 {v25.8h}, [x0], x1
ld1 {v18.8h}, [x9], x1
ld1 {v26.8h}, [x0], x1
ld1 {v19.8h}, [x9], x1
ld1 {v27.8h}, [x0], x1
ld1 {v20.8h}, [x9], x1
ld1 {v28.8h}, [x0], x1
ld1 {v21.8h}, [x9], x1
ld1 {v29.8h}, [x0], x1
ld1 {v22.8h}, [x9], x1
ld1 {v30.8h}, [x0], x1
ld1 {v23.8h}, [x9], x1
ld1 {v31.8h}, [x0], x1
sub x0, x0, x1, lsl #3
sub x9, x9, x1, lsl #3
// The 16x8 pixels read above is in two 8x8 blocks; the left
// half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
// of this, to get one column per register.
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
loop_filter_16
transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1
st1 {v16.8h}, [x9], x1
st1 {v10.8h}, [x0], x1
st1 {v2.8h}, [x9], x1
st1 {v11.8h}, [x0], x1
st1 {v3.8h}, [x9], x1
st1 {v12.8h}, [x0], x1
st1 {v4.8h}, [x9], x1
st1 {v13.8h}, [x0], x1
st1 {v5.8h}, [x9], x1
st1 {v14.8h}, [x0], x1
st1 {v6.8h}, [x9], x1
st1 {v15.8h}, [x0], x1
st1 {v8.8h}, [x9], x1
st1 {v17.8h}, [x0], x1
st1 {v9.8h}, [x9], x1
st1 {v31.8h}, [x0], x1
sub x0, x0, x1, lsl #3
br x10
8:
// The same writeback as in loop_filter_h_8_8
sub x9, x0, #8
add x0, x9, x1, lsl #2
transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
st1 {v20.8h}, [x9], x1
st1 {v24.8h}, [x0], x1
st1 {v21.8h}, [x9], x1
st1 {v25.8h}, [x0], x1
st1 {v22.8h}, [x9], x1
st1 {v26.8h}, [x0], x1
st1 {v23.8h}, [x9], x1
st1 {v27.8h}, [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #8
br x10
7:
// The same writeback as in loop_filter_h_4_8
sub x9, x0, #4
add x0, x9, x1, lsl #2
transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
st1 {v22.d}[0], [x9], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x9], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x9], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x9], x1
st1 {v25.d}[1], [x0], x1
sub x0, x0, x1, lsl #3
add x0, x0, #4
br x10
endfunc
bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1

File diff suppressed because it is too large


@@ -0,0 +1,631 @@
/*
* Copyright (c) 2017 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
// const uint8_t *ref, ptrdiff_t ref_stride,
// int h, int mx, int my);
function ff_vp9_copy128_aarch64, export=1
1:
ldp x5, x6, [x2]
ldp x7, x8, [x2, #16]
stp x5, x6, [x0]
ldp x9, x10, [x2, #32]
stp x7, x8, [x0, #16]
subs w4, w4, #1
ldp x11, x12, [x2, #48]
stp x9, x10, [x0, #32]
stp x11, x12, [x0, #48]
ldp x5, x6, [x2, #64]
ldp x7, x8, [x2, #80]
stp x5, x6, [x0, #64]
ldp x9, x10, [x2, #96]
stp x7, x8, [x0, #80]
ldp x11, x12, [x2, #112]
stp x9, x10, [x0, #96]
stp x11, x12, [x0, #112]
add x2, x2, x3
add x0, x0, x1
b.ne 1b
ret
endfunc
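// The avg functions compute a rounded average of the new pixels and the
// existing destination, dst[i] = (dst[i] + src[i] + 1) >> 1, via urhadd.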
function ff_vp9_avg64_16_neon, export=1
mov x5, x0
sub x1, x1, #64
sub x3, x3, #64
1:
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
urhadd v0.8h, v0.8h, v4.8h
urhadd v1.8h, v1.8h, v5.8h
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
urhadd v2.8h, v2.8h, v6.8h
urhadd v3.8h, v3.8h, v7.8h
subs w4, w4, #1
urhadd v16.8h, v16.8h, v20.8h
urhadd v17.8h, v17.8h, v21.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
urhadd v18.8h, v18.8h, v22.8h
urhadd v19.8h, v19.8h, v23.8h
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg32_16_neon, export=1
mov x5, x0
1:
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
urhadd v0.8h, v0.8h, v4.8h
urhadd v1.8h, v1.8h, v5.8h
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
urhadd v2.8h, v2.8h, v6.8h
urhadd v3.8h, v3.8h, v7.8h
subs w4, w4, #2
urhadd v16.8h, v16.8h, v20.8h
urhadd v17.8h, v17.8h, v21.8h
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
urhadd v18.8h, v18.8h, v22.8h
urhadd v19.8h, v19.8h, v23.8h
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg16_16_neon, export=1
1:
ld1 {v2.8h, v3.8h}, [x2], x3
ld1 {v0.8h, v1.8h}, [x0]
urhadd v0.8h, v0.8h, v2.8h
urhadd v1.8h, v1.8h, v3.8h
subs w4, w4, #1
st1 {v0.8h, v1.8h}, [x0], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg8_16_neon, export=1
mov x5, x0
1:
ld1 {v2.8h}, [x2], x3
ld1 {v0.8h}, [x0], x1
ld1 {v3.8h}, [x2], x3
urhadd v0.8h, v0.8h, v2.8h
ld1 {v1.8h}, [x0], x1
urhadd v1.8h, v1.8h, v3.8h
subs w4, w4, #2
st1 {v0.8h}, [x5], x1
st1 {v1.8h}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg4_16_neon, export=1
mov x5, x0
1:
ld1 {v2.4h}, [x2], x3
ld1 {v0.4h}, [x0], x1
ld1 {v3.4h}, [x2], x3
urhadd v0.4h, v0.4h, v2.4h
ld1 {v1.4h}, [x0], x1
urhadd v1.4h, v1.4h, v3.4h
subs w4, w4, #2
st1 {v0.4h}, [x5], x1
st1 {v1.4h}, [x5], x1
b.ne 1b
ret
endfunc
// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
// dst1-dst2 and dst5-dst6 for size >= 8, and dst1-dst4 and dst5-dst8
// for size >= 16)
.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
smlal \dst1\().4s, v20.4h, v0.h[\offset]
smlal \dst5\().4s, v22.4h, v0.h[\offset]
.if \size >= 16
ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
.endif
.if \size >= 8
smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
.endif
.if \size >= 16
smlal \dst3\().4s, v21.4h, v0.h[\offset]
smlal \dst7\().4s, v23.4h, v0.h[\offset]
smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
.endif
.endm
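// As an example, with offset=1 and size=4, the ext instructions produce
// the source vectors advanced by one 16 bit element, and the smlal
// instructions accumulate those elements, multiplied by filter
// coefficient 1, into the 32 bit accumulators.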
// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width (in bytes) is passed in x5, the height in w4 and
// the filter coefficients in x9.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
sub x2, x2, #6
add x6, x0, x1
add x7, x2, x3
add x1, x1, x1
add x3, x3, x3
// Only size >= 16 loops horizontally and needs
// reduced dst stride
.if \size >= 16
sub x1, x1, x5
.endif
// size >= 16 loads two qwords and increments x2,
// for size 4/8 it's enough with one qword and no
// postincrement
.if \size >= 16
sub x3, x3, x5
sub x3, x3, #16
.endif
// Load the filter vector
ld1 {v0.8h}, [x9]
1:
.if \size >= 16
mov x9, x5
.endif
// Load src
.if \size >= 16
ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
.else
ld1 {v5.8h, v6.8h}, [x2]
ld1 {v16.8h, v17.8h}, [x7]
.endif
2:
smull v1.4s, v5.4h, v0.h[0]
smull v24.4s, v16.4h, v0.h[0]
.if \size >= 8
smull2 v2.4s, v5.8h, v0.h[0]
smull2 v25.4s, v16.8h, v0.h[0]
.endif
.if \size >= 16
smull v3.4s, v6.4h, v0.h[0]
smull v26.4s, v17.4h, v0.h[0]
smull2 v4.4s, v6.8h, v0.h[0]
smull2 v27.4s, v17.8h, v0.h[0]
.endif
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size
// Round, shift and saturate
// The sqrshrun takes care of clamping negative values to zero, but
// we need to manually clamp against the max pixel value with umin.
sqrshrun v1.4h, v1.4s, #7
sqrshrun v24.4h, v24.4s, #7
.if \size >= 8
sqrshrun2 v1.8h, v2.4s, #7
sqrshrun2 v24.8h, v25.4s, #7
umin v1.8h, v1.8h, v31.8h
umin v24.8h, v24.8h, v31.8h
.if \size >= 16
sqrshrun v2.4h, v3.4s, #7
sqrshrun v25.4h, v26.4s, #7
sqrshrun2 v2.8h, v4.4s, #7
sqrshrun2 v25.8h, v27.4s, #7
umin v2.8h, v2.8h, v31.8h
umin v25.8h, v25.8h, v31.8h
.endif
.else
umin v1.4h, v1.4h, v31.4h
umin v24.4h, v24.4h, v31.4h
.endif
// Average
.ifc \type,avg
.if \size >= 16
ld1 {v3.8h, v4.8h}, [x0]
ld1 {v29.8h, v30.8h}, [x6]
urhadd v1.8h, v1.8h, v3.8h
urhadd v2.8h, v2.8h, v4.8h
urhadd v24.8h, v24.8h, v29.8h
urhadd v25.8h, v25.8h, v30.8h
.elseif \size >= 8
ld1 {v3.8h}, [x0]
ld1 {v4.8h}, [x6]
urhadd v1.8h, v1.8h, v3.8h
urhadd v24.8h, v24.8h, v4.8h
.else
ld1 {v3.4h}, [x0]
ld1 {v4.4h}, [x6]
urhadd v1.4h, v1.4h, v3.4h
urhadd v24.4h, v24.4h, v4.4h
.endif
.endif
// Store and loop horizontally (for size >= 16)
.if \size >= 16
subs x9, x9, #32
st1 {v1.8h, v2.8h}, [x0], #32
st1 {v24.8h, v25.8h}, [x6], #32
b.eq 3f
mov v5.16b, v7.16b
mov v16.16b, v18.16b
ld1 {v6.8h, v7.8h}, [x2], #32
ld1 {v17.8h, v18.8h}, [x7], #32
b 2b
.elseif \size == 8
st1 {v1.8h}, [x0]
st1 {v24.8h}, [x6]
.else // \size == 4
st1 {v1.4h}, [x0]
st1 {v24.4h}, [x6]
.endif
3:
// Loop vertically
add x0, x0, x1
add x6, x6, x1
add x2, x2, x3
add x7, x7, x3
subs w4, w4, #2
b.ne 1b
ret
endfunc
.endm
.macro do_8tap_h_size size
do_8tap_h put, \size
do_8tap_h avg, \size
.endm
do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
movrel x6, X(ff_vp9_subpel_filters), 256*\offset
cmp w5, #8
add x9, x6, w5, uxtw #4
mov x5, #2*\size
.if \size >= 16
b \type\()_8tap_16h
.else
b \type\()_8tap_\size\()h
.endif
endfunc
.endm
.macro do_8tap_h_filters size, bpp
do_8tap_h_func put, regular, 1, \size, \bpp
do_8tap_h_func avg, regular, 1, \size, \bpp
do_8tap_h_func put, sharp, 2, \size, \bpp
do_8tap_h_func avg, sharp, 2, \size, \bpp
do_8tap_h_func put, smooth, 0, \size, \bpp
do_8tap_h_func avg, smooth, 0, \size, \bpp
.endm
.macro do_8tap_h_filters_bpp bpp
do_8tap_h_filters 64, \bpp
do_8tap_h_filters 32, \bpp
do_8tap_h_filters 16, \bpp
do_8tap_h_filters 8, \bpp
do_8tap_h_filters 4, \bpp
.endm
do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12
// Vertical filters
// Round, shift and saturate and store reg1-reg4
.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
sqrshrun \reg1\().4h, \reg1\().4s, #7
sqrshrun \reg2\().4h, \reg2\().4s, #7
sqrshrun \reg3\().4h, \reg3\().4s, #7
sqrshrun \reg4\().4h, \reg4\().4s, #7
.ifc \type,avg
ld1 {\tmp1\().4h}, [x7], x1
ld1 {\tmp2\().4h}, [x7], x1
ld1 {\tmp3\().4h}, [x7], x1
ld1 {\tmp4\().4h}, [x7], x1
.endif
umin \reg1\().4h, \reg1\().4h, \minreg\().4h
umin \reg2\().4h, \reg2\().4h, \minreg\().4h
umin \reg3\().4h, \reg3\().4h, \minreg\().4h
umin \reg4\().4h, \reg4\().4h, \minreg\().4h
.ifc \type,avg
urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
.endif
st1 {\reg1\().4h}, [x0], x1
st1 {\reg2\().4h}, [x0], x1
st1 {\reg3\().4h}, [x0], x1
st1 {\reg4\().4h}, [x0], x1
.endm
// Round, shift and saturate and store reg1-8, where
// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
sqrshrun \reg1\().4h, \reg1\().4s, #7
sqrshrun2 \reg1\().8h, \reg2\().4s, #7
sqrshrun \reg2\().4h, \reg3\().4s, #7
sqrshrun2 \reg2\().8h, \reg4\().4s, #7
sqrshrun \reg3\().4h, \reg5\().4s, #7
sqrshrun2 \reg3\().8h, \reg6\().4s, #7
sqrshrun \reg4\().4h, \reg7\().4s, #7
sqrshrun2 \reg4\().8h, \reg8\().4s, #7
.ifc \type,avg
ld1 {\reg5\().8h}, [x7], x1
ld1 {\reg6\().8h}, [x7], x1
ld1 {\reg7\().8h}, [x7], x1
ld1 {\reg8\().8h}, [x7], x1
.endif
umin \reg1\().8h, \reg1\().8h, \minreg\().8h
umin \reg2\().8h, \reg2\().8h, \minreg\().8h
umin \reg3\().8h, \reg3\().8h, \minreg\().8h
umin \reg4\().8h, \reg4\().8h, \minreg\().8h
.ifc \type,avg
urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
.endif
st1 {\reg1\().8h}, [x0], x1
st1 {\reg2\().8h}, [x0], x1
st1 {\reg3\().8h}, [x0], x1
st1 {\reg4\().8h}, [x0], x1
.endm
// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2).
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
smull \dst1\().4s, \src1\().4h, v0.h[0]
smull \dst2\().4s, \src2\().4h, v0.h[0]
smull \tmp1\().4s, \src2\().4h, v0.h[1]
smull \tmp2\().4s, \src3\().4h, v0.h[1]
smlal \dst1\().4s, \src3\().4h, v0.h[2]
smlal \dst2\().4s, \src4\().4h, v0.h[2]
smlal \tmp1\().4s, \src4\().4h, v0.h[3]
smlal \tmp2\().4s, \src5\().4h, v0.h[3]
smlal \dst1\().4s, \src5\().4h, v0.h[4]
smlal \dst2\().4s, \src6\().4h, v0.h[4]
smlal \tmp1\().4s, \src6\().4h, v0.h[5]
smlal \tmp2\().4s, \src7\().4h, v0.h[5]
smlal \dst1\().4s, \src7\().4h, v0.h[6]
smlal \dst2\().4s, \src8\().4h, v0.h[6]
smlal \tmp1\().4s, \src8\().4h, v0.h[7]
smlal \tmp2\().4s, \src9\().4h, v0.h[7]
add \dst1\().4s, \dst1\().4s, \tmp1\().4s
add \dst2\().4s, \dst2\().4s, \tmp2\().4s
.endm
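// The taps are split across two accumulator chains that are only summed at
// the end, presumably to shorten the dependency chain between the
// back-to-back multiply-accumulates.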
// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
smull \dst1\().4s, \src1\().4h, v0.h[0]
smull2 \dst2\().4s, \src1\().8h, v0.h[0]
smull \dst3\().4s, \src2\().4h, v0.h[0]
smull2 \dst4\().4s, \src2\().8h, v0.h[0]
smlal \dst1\().4s, \src2\().4h, v0.h[1]
smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
smlal \dst3\().4s, \src3\().4h, v0.h[1]
smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
smlal \dst1\().4s, \src3\().4h, v0.h[2]
smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
smlal \dst3\().4s, \src4\().4h, v0.h[2]
smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
smlal \dst1\().4s, \src4\().4h, v0.h[3]
smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
smlal \dst3\().4s, \src5\().4h, v0.h[3]
smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
smlal \dst1\().4s, \src5\().4h, v0.h[4]
smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
smlal \dst3\().4s, \src6\().4h, v0.h[4]
smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
smlal \dst1\().4s, \src6\().4h, v0.h[5]
smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
smlal \dst3\().4s, \src7\().4h, v0.h[5]
smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
smlal \dst1\().4s, \src7\().4h, v0.h[6]
smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
smlal \dst3\().4s, \src8\().4h, v0.h[6]
smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
smlal \dst1\().4s, \src8\().4h, v0.h[7]
smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
smlal \dst3\().4s, \src9\().4h, v0.h[7]
smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
.endm
// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6.
.macro do_8tap_8v type
function \type\()_8tap_8v
sub x2, x2, x3, lsl #1
sub x2, x2, x3
ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
mov x7, x0
.endif
mov x6, x4
ld1 {v17.8h}, [x2], x3
ld1 {v18.8h}, [x2], x3
ld1 {v19.8h}, [x2], x3
ld1 {v20.8h}, [x2], x3
ld1 {v21.8h}, [x2], x3
ld1 {v22.8h}, [x2], x3
ld1 {v23.8h}, [x2], x3
2:
ld1 {v24.8h}, [x2], x3
ld1 {v25.8h}, [x2], x3
ld1 {v26.8h}, [x2], x3
ld1 {v27.8h}, [x2], x3
convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type
subs x6, x6, #4
b.eq 8f
ld1 {v16.8h}, [x2], x3
ld1 {v17.8h}, [x2], x3
ld1 {v18.8h}, [x2], x3
ld1 {v19.8h}, [x2], x3
convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type
subs x6, x6, #4
b.eq 8f
ld1 {v20.8h}, [x2], x3
ld1 {v21.8h}, [x2], x3
ld1 {v22.8h}, [x2], x3
ld1 {v23.8h}, [x2], x3
convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type
subs x6, x6, #4
b.ne 2b
8:
subs x5, x5, #8
b.eq 9f
// x0 -= h * dst_stride
msub x0, x1, x4, x0
// x2 -= h * src_stride
msub x2, x3, x4, x2
// x2 -= 8 * src_stride
sub x2, x2, x3, lsl #3
// x2 += 1 * src_stride
add x2, x2, x3
add x2, x2, #16
add x0, x0, #16
b 1b
9:
ret
endfunc
.endm
do_8tap_8v put
do_8tap_8v avg
// Instantiate a vertical filter function for filtering a 4 pixel wide
// slice. This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
sub x2, x2, x3, lsl #1
sub x2, x2, x3
ld1 {v0.8h}, [x6]
.ifc \type,avg
mov x7, x0
.endif
ld1 {v16.4h}, [x2], x3
ld1 {v17.4h}, [x2], x3
ld1 {v18.4h}, [x2], x3
ld1 {v19.4h}, [x2], x3
ld1 {v20.4h}, [x2], x3
ld1 {v21.4h}, [x2], x3
ld1 {v22.4h}, [x2], x3
ld1 {v23.4h}, [x2], x3
ld1 {v24.4h}, [x2], x3
ld1 {v25.4h}, [x2], x3
ld1 {v26.4h}, [x2], x3
convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type
subs x4, x4, #4
b.eq 9f
ld1 {v27.4h}, [x2], x3
ld1 {v28.4h}, [x2], x3
ld1 {v29.4h}, [x2], x3
ld1 {v30.4h}, [x2], x3
convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type
9:
ret
endfunc
.endm
do_8tap_4v put
do_8tap_4v avg
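// Each subpel phase of the filter bank consists of 8 int16_t coefficients
// (16 bytes), so the row for a given my is at byte offset my << 4 within
// the selected filter.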
.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
uxtw x4, w4
mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
movrel x5, X(ff_vp9_subpel_filters), 256*\offset
add x6, x5, w6, uxtw #4
mov x5, #\size
.if \size >= 8
b \type\()_8tap_8v
.else
b \type\()_8tap_4v
.endif
endfunc
.endm
.macro do_8tap_v_filters size, bpp
do_8tap_v_func put, regular, 1, \size, \bpp
do_8tap_v_func avg, regular, 1, \size, \bpp
do_8tap_v_func put, sharp, 2, \size, \bpp
do_8tap_v_func avg, sharp, 2, \size, \bpp
do_8tap_v_func put, smooth, 0, \size, \bpp
do_8tap_v_func avg, smooth, 0, \size, \bpp
.endm
.macro do_8tap_v_filters_bpp bpp
do_8tap_v_filters 64, \bpp
do_8tap_v_filters 32, \bpp
do_8tap_v_filters 16, \bpp
do_8tap_v_filters 8, \bpp
do_8tap_v_filters 4, \bpp
.endm
do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12


@@ -0,0 +1,687 @@
/*
* Copyright (c) 2016 Google Inc.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
// const uint8_t *ref, ptrdiff_t ref_stride,
// int h, int mx, int my);
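// The 64 and 32 pixel wide copies below use only general purpose load/store
// pair instructions and no NEON registers, hence their _aarch64 (rather
// than _neon) suffix.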
function ff_vp9_copy64_aarch64, export=1
1:
ldp x5, x6, [x2]
ldp x7, x8, [x2, #16]
stp x5, x6, [x0]
ldp x9, x10, [x2, #32]
stp x7, x8, [x0, #16]
subs w4, w4, #1
ldp x11, x12, [x2, #48]
stp x9, x10, [x0, #32]
stp x11, x12, [x0, #48]
add x2, x2, x3
add x0, x0, x1
b.ne 1b
ret
endfunc
function ff_vp9_avg64_neon, export=1
mov x5, x0
1:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
urhadd v0.16b, v0.16b, v4.16b
urhadd v1.16b, v1.16b, v5.16b
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
urhadd v2.16b, v2.16b, v6.16b
urhadd v3.16b, v3.16b, v7.16b
subs w4, w4, #2
urhadd v16.16b, v16.16b, v20.16b
urhadd v17.16b, v17.16b, v21.16b
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
urhadd v18.16b, v18.16b, v22.16b
urhadd v19.16b, v19.16b, v23.16b
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_copy32_aarch64, export=1
1:
ldp x5, x6, [x2]
ldp x7, x8, [x2, #16]
stp x5, x6, [x0]
subs w4, w4, #1
stp x7, x8, [x0, #16]
add x2, x2, x3
add x0, x0, x1
b.ne 1b
ret
endfunc
function ff_vp9_avg32_neon, export=1
1:
ld1 {v2.16b, v3.16b}, [x2], x3
ld1 {v0.16b, v1.16b}, [x0]
urhadd v0.16b, v0.16b, v2.16b
urhadd v1.16b, v1.16b, v3.16b
subs w4, w4, #1
st1 {v0.16b, v1.16b}, [x0], x1
b.ne 1b
ret
endfunc
function ff_vp9_copy16_neon, export=1
add x5, x0, x1
lsl x1, x1, #1
add x6, x2, x3
lsl x3, x3, #1
1:
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x6], x3
ld1 {v2.16b}, [x2], x3
ld1 {v3.16b}, [x6], x3
subs w4, w4, #4
st1 {v0.16b}, [x0], x1
st1 {v1.16b}, [x5], x1
st1 {v2.16b}, [x0], x1
st1 {v3.16b}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg16_neon, export=1
mov x5, x0
1:
ld1 {v2.16b}, [x2], x3
ld1 {v0.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
urhadd v0.16b, v0.16b, v2.16b
ld1 {v1.16b}, [x0], x1
urhadd v1.16b, v1.16b, v3.16b
subs w4, w4, #2
st1 {v0.16b}, [x5], x1
st1 {v1.16b}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_copy8_neon, export=1
1:
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x2], x3
subs w4, w4, #2
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg8_neon, export=1
mov x5, x0
1:
ld1 {v2.8b}, [x2], x3
ld1 {v0.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
urhadd v0.8b, v0.8b, v2.8b
ld1 {v1.8b}, [x0], x1
urhadd v1.8b, v1.8b, v3.8b
subs w4, w4, #2
st1 {v0.8b}, [x5], x1
st1 {v1.8b}, [x5], x1
b.ne 1b
ret
endfunc
function ff_vp9_copy4_neon, export=1
1:
ld1 {v0.s}[0], [x2], x3
ld1 {v1.s}[0], [x2], x3
st1 {v0.s}[0], [x0], x1
ld1 {v2.s}[0], [x2], x3
st1 {v1.s}[0], [x0], x1
ld1 {v3.s}[0], [x2], x3
subs w4, w4, #4
st1 {v2.s}[0], [x0], x1
st1 {v3.s}[0], [x0], x1
b.ne 1b
ret
endfunc
function ff_vp9_avg4_neon, export=1
mov x5, x0
1:
ld1 {v2.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
ld1 {v2.s}[1], [x2], x3
ld1 {v0.s}[1], [x0], x1
ld1 {v3.s}[0], [x2], x3
ld1 {v1.s}[0], [x0], x1
ld1 {v3.s}[1], [x2], x3
ld1 {v1.s}[1], [x0], x1
subs w4, w4, #4
urhadd v0.8b, v0.8b, v2.8b
urhadd v1.8b, v1.8b, v3.8b
st1 {v0.s}[0], [x5], x1
st1 {v0.s}[1], [x5], x1
st1 {v1.s}[0], [x5], x1
st1 {v1.s}[1], [x5], x1
b.ne 1b
ret
endfunc
// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
mla \dst1\().8h, v20.8h, v0.h[\offset]
ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
mla \dst3\().8h, v22.8h, v0.h[\offset]
ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
mla \dst2\().8h, v21.8h, v0.h[\offset]
mla \dst4\().8h, v23.8h, v0.h[\offset]
.elseif \size == 8
mla \dst1\().8h, v20.8h, v0.h[\offset]
mla \dst3\().8h, v22.8h, v0.h[\offset]
.else
mla \dst1\().4h, v20.4h, v0.h[\offset]
mla \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
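// For example, extmla with offset=2 and size=8 extracts the source vector
// advanced by two elements and accumulates it, multiplied by filter
// coefficient 2, into the 16 bit accumulators with a plain
// (non-saturating) mla.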
// The same as above, except that instead of accumulating straight into the
// destination, it multiplies into a temp register and accumulates that
// with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
.if \size >= 16
mul v20.8h, v20.8h, v0.h[\offset]
ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
mul v22.8h, v22.8h, v0.h[\offset]
ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
mul v21.8h, v21.8h, v0.h[\offset]
mul v23.8h, v23.8h, v0.h[\offset]
.elseif \size == 8
mul v20.8h, v20.8h, v0.h[\offset]
mul v22.8h, v22.8h, v0.h[\offset]
.else
mul v20.4h, v20.4h, v0.h[\offset]
mul v22.4h, v22.4h, v0.h[\offset]
.endif
.if \size == 4
sqadd \dst1\().4h, \dst1\().4h, v20.4h
sqadd \dst3\().4h, \dst3\().4h, v22.4h
.else
sqadd \dst1\().8h, \dst1\().8h, v20.8h
sqadd \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
sqadd \dst2\().8h, \dst2\().8h, v21.8h
sqadd \dst4\().8h, \dst4\().8h, v23.8h
.endif
.endif
.endm
// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
sub x2, x2, #3
add x6, x0, x1
add x7, x2, x3
add x1, x1, x1
add x3, x3, x3
// Only size >= 16 loops horizontally and needs
// reduced dst stride
.if \size >= 16
sub x1, x1, x5
.endif
// size >= 16 loads two qwords and increments x2,
// for size 4/8 it's enough with one qword and no
// postincrement
.if \size >= 16
sub x3, x3, x5
sub x3, x3, #8
.endif
// Load the filter vector
ld1 {v0.8h}, [x9]
1:
.if \size >= 16
mov x9, x5
.endif
// Load src
.if \size >= 16
ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
ld1 {v4.8b, v5.8b}, [x2]
ld1 {v16.8b, v17.8b}, [x7]
.endif
uxtl v4.8h, v4.8b
uxtl v5.8h, v5.8b
uxtl v16.8h, v16.8b
uxtl v17.8h, v17.8b
.if \size >= 16
uxtl v6.8h, v6.8b
uxtl v18.8h, v18.8b
.endif
2:
// Accumulate, adding idx2 last with a separate
// saturating add. The positive filter coefficients
// for all indices except idx2 must add up to less
// than 127 for this not to overflow.
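// (With 8 bit input pixels, i.e. values <= 255, this keeps the
// intermediate sums below 255 * 127 = 32385 < 32767, within range of a
// signed 16 bit lane.)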
mul v1.8h, v4.8h, v0.h[0]
mul v24.8h, v16.8h, v0.h[0]
.if \size >= 16
mul v2.8h, v5.8h, v0.h[0]
mul v25.8h, v17.8h, v0.h[0]
.endif
extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size
// Round, shift and saturate
sqrshrun v1.8b, v1.8h, #7
sqrshrun v24.8b, v24.8h, #7
.if \size >= 16
sqrshrun2 v1.16b, v2.8h, #7
sqrshrun2 v24.16b, v25.8h, #7
.endif
// Average
.ifc \type,avg
.if \size >= 16
ld1 {v2.16b}, [x0]
ld1 {v3.16b}, [x6]
urhadd v1.16b, v1.16b, v2.16b
urhadd v24.16b, v24.16b, v3.16b
.elseif \size == 8
ld1 {v2.8b}, [x0]
ld1 {v3.8b}, [x6]
urhadd v1.8b, v1.8b, v2.8b
urhadd v24.8b, v24.8b, v3.8b
.else
ld1 {v2.s}[0], [x0]
ld1 {v3.s}[0], [x6]
urhadd v1.8b, v1.8b, v2.8b
urhadd v24.8b, v24.8b, v3.8b
.endif
.endif
// Store and loop horizontally (for size >= 16)
.if \size >= 16
subs x9, x9, #16
st1 {v1.16b}, [x0], #16
st1 {v24.16b}, [x6], #16
b.eq 3f
mov v4.16b, v6.16b
mov v16.16b, v18.16b
ld1 {v6.16b}, [x2], #16
ld1 {v18.16b}, [x7], #16
uxtl v5.8h, v6.8b
uxtl2 v6.8h, v6.16b
uxtl v17.8h, v18.8b
uxtl2 v18.8h, v18.16b
b 2b
.elseif \size == 8
st1 {v1.8b}, [x0]
st1 {v24.8b}, [x6]
.else // \size == 4
st1 {v1.s}[0], [x0]
st1 {v24.s}[0], [x6]
.endif
3:
// Loop vertically
add x0, x0, x1
add x6, x6, x1
add x2, x2, x3
add x7, x7, x3
subs w4, w4, #2
b.ne 1b
ret
endfunc
.endm
.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm
do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
movrel x6, X(ff_vp9_subpel_filters), 256*\offset
cmp w5, #8
add x9, x6, w5, uxtw #4
mov x5, #\size
.if \size >= 16
b.ge \type\()_8tap_16h_34
b \type\()_8tap_16h_43
.else
b.ge \type\()_8tap_\size\()h_34
b \type\()_8tap_\size\()h_43
.endif
endfunc
.endm
.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp, 2, \size
do_8tap_h_func avg, sharp, 2, \size
do_8tap_h_func put, smooth, 0, \size
do_8tap_h_func avg, smooth, 0, \size
.endm
do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4
// Vertical filters
// Round, shift, saturate and store reg1-reg2 over 4 lines
.macro do_store4 reg1, reg2, tmp1, tmp2, type
sqrshrun \reg1\().8b, \reg1\().8h, #7
sqrshrun \reg2\().8b, \reg2\().8h, #7
.ifc \type,avg
ld1 {\tmp1\().s}[0], [x7], x1
ld1 {\tmp2\().s}[0], [x7], x1
ld1 {\tmp1\().s}[1], [x7], x1
ld1 {\tmp2\().s}[1], [x7], x1
urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
.endif
st1 {\reg1\().s}[0], [x0], x1
st1 {\reg2\().s}[0], [x0], x1
st1 {\reg1\().s}[1], [x0], x1
st1 {\reg2\().s}[1], [x0], x1
.endm
// Round, shift, saturate and store reg1-reg4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
sqrshrun \reg1\().8b, \reg1\().8h, #7
sqrshrun \reg2\().8b, \reg2\().8h, #7
sqrshrun \reg3\().8b, \reg3\().8h, #7
sqrshrun \reg4\().8b, \reg4\().8h, #7
.ifc \type,avg
ld1 {\tmp1\().8b}, [x7], x1
ld1 {\tmp2\().8b}, [x7], x1
ld1 {\tmp3\().8b}, [x7], x1
ld1 {\tmp4\().8b}, [x7], x1
urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
.endif
st1 {\reg1\().8b}, [x0], x1
st1 {\reg2\().8b}, [x0], x1
st1 {\reg3\().8b}, [x0], x1
st1 {\reg4\().8b}, [x0], x1
.endm
// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
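// As a scalar sketch (sat_add16 is an assumed helper name standing for
// the sqadd at the end), with f[k] = v0.h[k] and s1-s8 feeding dst1:
//   dst1 = s2*f[1] + s3*f[2] + s_idx1*f[idx1] + s6*f[5] + s7*f[6];
//   tmp1 = s1*f[0] + s8*f[7] + s_idx2*f[idx2];
//   dst1 = sat_add16(dst1, tmp1);
// where s_idx1/s_idx2 are src4 or src5 depending on idx1/idx2.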
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
mul \dst1\().8h, \src2\().8h, v0.h[1]
mul \dst2\().8h, \src3\().8h, v0.h[1]
mul \tmp1\().8h, \src1\().8h, v0.h[0]
mul \tmp2\().8h, \src2\().8h, v0.h[0]
mla \dst1\().8h, \src3\().8h, v0.h[2]
mla \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
mla \dst1\().8h, \src4\().8h, v0.h[3]
mla \dst2\().8h, \src5\().8h, v0.h[3]
.else
mla \dst1\().8h, \src5\().8h, v0.h[4]
mla \dst2\().8h, \src6\().8h, v0.h[4]
.endif
mla \dst1\().8h, \src6\().8h, v0.h[5]
mla \dst2\().8h, \src7\().8h, v0.h[5]
mla \tmp1\().8h, \src8\().8h, v0.h[7]
mla \tmp2\().8h, \src9\().8h, v0.h[7]
mla \dst1\().8h, \src7\().8h, v0.h[6]
mla \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
mla \tmp1\().8h, \src4\().8h, v0.h[3]
mla \tmp2\().8h, \src5\().8h, v0.h[3]
.else
mla \tmp1\().8h, \src5\().8h, v0.h[4]
mla \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm
// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
ld1 {v1.8b}, [x2], x3
ld1 {v2.8b}, [x2], x3
ld1 {v3.8b}, [x2], x3
.ifnb \dst4
ld1 {v4.8b}, [x2], x3
.endif
uxtl \dst1\().8h, v1.8b
uxtl \dst2\().8h, v2.8b
uxtl \dst3\().8h, v3.8b
.ifnb \dst4
uxtl \dst4\().8h, v4.8b
.endif
.endm
// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other.
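// The loop below keeps a ring of twelve row registers (v16-v27) and
// loads four fresh rows per iteration; each pair of convolve calls
// reads eleven consecutive rows and emits four output rows.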
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
sub x2, x2, x3, lsl #1
sub x2, x2, x3
ld1 {v0.8h}, [x6]
1:
.ifc \type,avg
mov x7, x0
.endif
mov x6, x4
loadl v17, v18, v19
loadl v20, v21, v22, v23
2:
loadl v24, v25, v26, v27
convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
subs x6, x6, #4
b.eq 8f
loadl v16, v17, v18, v19
convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
subs x6, x6, #4
b.eq 8f
loadl v20, v21, v22, v23
convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
subs x6, x6, #4
b.ne 2b
8:
subs x5, x5, #8
b.eq 9f
// x0 -= h * dst_stride
msub x0, x1, x4, x0
// x2 -= h * src_stride
msub x2, x3, x4, x2
// x2 -= 8 * src_stride
sub x2, x2, x3, lsl #3
// x2 += 1 * src_stride
add x2, x2, x3
add x2, x2, #8
add x0, x0, #8
b 1b
9:
ret
endfunc
.endm
do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3
// Instantiate a vertical filter function for filtering a slice 4
// pixels wide. The first half of each register contains one row, while
// the second half contains the second-next row (also stored in the
// first half of the register two steps ahead). The convolution produces
// two outputs at a time: v17-v24 into one and v18-v25 into the other.
// The first half of the first output is the first output row; the first
// half of the other output is the second output row. The second halves
// of the two outputs hold rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
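// Layout sketch (inferred from the trn1/uxtl sequence below), with rN
// denoting input row N:
//   v17 = { r0 | r2 }   v18 = { r1 | r3 }   v19 = { r2 | r4 }   ...
// One convolve over v17-v25 then yields output rows 0 and 2 in the
// halves of dst1 and rows 1 and 3 in the halves of dst2, matching the
// interleaved stores in do_store4.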
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
sub x2, x2, x3, lsl #1
sub x2, x2, x3
ld1 {v0.8h}, [x6]
.ifc \type,avg
mov x7, x0
.endif
ld1 {v1.s}[0], [x2], x3
ld1 {v2.s}[0], [x2], x3
ld1 {v3.s}[0], [x2], x3
ld1 {v4.s}[0], [x2], x3
ld1 {v5.s}[0], [x2], x3
ld1 {v6.s}[0], [x2], x3
trn1 v1.2s, v1.2s, v3.2s
ld1 {v7.s}[0], [x2], x3
trn1 v2.2s, v2.2s, v4.2s
ld1 {v26.s}[0], [x2], x3
uxtl v17.8h, v1.8b
trn1 v3.2s, v3.2s, v5.2s
ld1 {v27.s}[0], [x2], x3
uxtl v18.8h, v2.8b
trn1 v4.2s, v4.2s, v6.2s
ld1 {v28.s}[0], [x2], x3
uxtl v19.8h, v3.8b
trn1 v5.2s, v5.2s, v7.2s
ld1 {v29.s}[0], [x2], x3
uxtl v20.8h, v4.8b
trn1 v6.2s, v6.2s, v26.2s
uxtl v21.8h, v5.8b
trn1 v7.2s, v7.2s, v27.2s
uxtl v22.8h, v6.8b
trn1 v26.2s, v26.2s, v28.2s
uxtl v23.8h, v7.8b
trn1 v27.2s, v27.2s, v29.2s
uxtl v24.8h, v26.8b
uxtl v25.8h, v27.8b
convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
do_store4 v1, v2, v5, v6, \type
subs x4, x4, #4
b.eq 9f
ld1 {v1.s}[0], [x2], x3
ld1 {v2.s}[0], [x2], x3
trn1 v28.2s, v28.2s, v1.2s
trn1 v29.2s, v29.2s, v2.2s
ld1 {v1.s}[1], [x2], x3
uxtl v26.8h, v28.8b
ld1 {v2.s}[1], [x2], x3
uxtl v27.8h, v29.8b
uxtl v28.8h, v1.8b
uxtl v29.8h, v2.8b
convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
do_store4 v1, v2, v5, v6, \type
9:
ret
endfunc
.endm
do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
uxtw x4, w4
movrel x5, X(ff_vp9_subpel_filters), 256*\offset
cmp w6, #8
add x6, x5, w6, uxtw #4
mov x5, #\size
.if \size >= 8
b.ge \type\()_8tap_8v_34
b \type\()_8tap_8v_43
.else
b.ge \type\()_8tap_4v_34
b \type\()_8tap_4v_43
.endif
endfunc
.endm
.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp, 2, \size
do_8tap_v_func avg, sharp, 2, \size
do_8tap_v_func put, smooth, 0, \size
do_8tap_v_func avg, smooth, 0, \size
.endm
do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4


@ -0,0 +1,104 @@
/*
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#ifdef __ELF__
# define ELF
#else
# define ELF #
#endif
#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC #
#endif
.macro function name, export=0, align=2
.macro endfunc
ELF .size \name, . - \name
FUNC .endfunc
.purgem endfunc
.endm
.text
.align \align
.if \export
.global EXTERN_ASM\name
ELF .type EXTERN_ASM\name, %function
FUNC .func EXTERN_ASM\name
EXTERN_ASM\name:
.else
ELF .type \name, %function
FUNC .func \name
\name:
.endif
.endm
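// Illustrative use (ff_foo_neon is a made-up name):
//   function ff_foo_neon, export=1
//           ret
//   endfunc
// The nested endfunc macro emits the matching ELF .size directive and
// then purges itself, so the next function can define it again.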
.macro const name, align=2, relocate=0
.macro endconst
ELF .size \name, . - \name
.purgem endconst
.endm
#if HAVE_SECTION_DATA_REL_RO
.if \relocate
.section .data.rel.ro
.else
.section .rodata
.endif
#elif !defined(__MACH__)
.section .rodata
#else
.const_data
#endif
.align \align
\name:
.endm
.macro movrel rd, val, offset=0
#if CONFIG_PIC && defined(__APPLE__)
.if \offset < 0
adrp \rd, \val@PAGE
add \rd, \rd, \val@PAGEOFF
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)@PAGE
add \rd, \rd, \val+(\offset)@PAGEOFF
.endif
#elif CONFIG_PIC && defined(_WIN32)
.if \offset < 0
adrp \rd, \val
add \rd, \rd, :lo12:\val
sub \rd, \rd, -(\offset)
.else
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
.endif
#elif CONFIG_PIC
adrp \rd, \val+(\offset)
add \rd, \rd, :lo12:\val+(\offset)
#else
ldr \rd, =\val+\offset
#endif
.endm
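// Usage elsewhere in this patch:
//   movrel x6, X(ff_vp9_subpel_filters), 256*\offset
// loads the address of an exported symbol plus a constant byte offset,
// via adrp/add pairs for PIC builds and a literal-pool ldr otherwise.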
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)


@ -0,0 +1,51 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AARCH64_BSWAP_H
#define AVUTIL_AARCH64_BSWAP_H
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#if HAVE_INLINE_ASM
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
__asm__("rev16 %w0, %w0" : "+r"(x));
return x;
}
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
__asm__("rev %w0, %w0" : "+r"(x));
return x;
}
#define av_bswap64 av_bswap64
static av_always_inline av_const uint64_t av_bswap64(uint64_t x)
{
__asm__("rev %0, %0" : "+r"(x));
return x;
}
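/* Worked example: av_bswap32(0x11223344) returns 0x44332211. rev16, rev
 * and the 64-bit rev reverse the bytes within halfwords, a word and a
 * doubleword respectively. */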
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_AARCH64_BSWAP_H */


@ -0,0 +1,38 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#include "config.h"
int ff_get_cpu_flags_aarch64(void)
{
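/* The HAVE_* macros from config.h are always 0 or 1, so each multiply
 * below keeps or drops a flag according to what this build enables. */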
return AV_CPU_FLAG_ARMV8 * HAVE_ARMV8 |
AV_CPU_FLAG_NEON * HAVE_NEON |
AV_CPU_FLAG_VFP * HAVE_VFP;
}
size_t ff_get_cpu_max_align_aarch64(void)
{
int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_NEON)
return 16;
return 8;
}


@ -0,0 +1,29 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AARCH64_CPU_H
#define AVUTIL_AARCH64_CPU_H
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#define have_armv8(flags) CPUEXT(flags, ARMV8)
#define have_neon(flags) CPUEXT(flags, NEON)
#define have_vfp(flags) CPUEXT(flags, VFP)
#endif /* AVUTIL_AARCH64_CPU_H */


@ -0,0 +1,69 @@
/*
* ARM NEON optimised Float DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdint.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1,
int len);
void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
int len);
void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
int len);
void ff_vector_dmul_scalar_neon(double *dst, const double *src, double mul,
int len);
void ff_vector_fmul_window_neon(float *dst, const float *src0,
const float *src1, const float *win, int len);
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
const float *src2, int len);
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
const float *src1, int len);
void ff_butterflies_float_neon(float *v1, float *v2, int len);
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
av_cold void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
fdsp->butterflies_float = ff_butterflies_float_neon;
fdsp->scalarproduct_float = ff_scalarproduct_float_neon;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_neon;
fdsp->vector_fmul = ff_vector_fmul_neon;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon;
fdsp->vector_fmul_add = ff_vector_fmul_add_neon;
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
fdsp->vector_fmul_window = ff_vector_fmul_window_neon;
}
}


@ -0,0 +1,202 @@
/*
* ARM NEON optimised Float DSP functions
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "asm.S"
function ff_vector_fmul_neon, export=1
1: subs w3, w3, #16
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x1], #32
ld1 {v4.4S, v5.4S}, [x2], #32
ld1 {v6.4S, v7.4S}, [x2], #32
fmul v16.4S, v0.4S, v4.4S
fmul v17.4S, v1.4S, v5.4S
fmul v18.4S, v2.4S, v6.4S
fmul v19.4S, v3.4S, v7.4S
st1 {v16.4S, v17.4S}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32
b.ne 1b
ret
endfunc
function ff_vector_fmac_scalar_neon, export=1
mov x3, #-32
1: subs w2, w2, #16
ld1 {v16.4S, v17.4S}, [x0], #32
ld1 {v18.4S, v19.4S}, [x0], x3
ld1 {v4.4S, v5.4S}, [x1], #32
ld1 {v6.4S, v7.4S}, [x1], #32
fmla v16.4S, v4.4S, v0.S[0]
fmla v17.4S, v5.4S, v0.S[0]
fmla v18.4S, v6.4S, v0.S[0]
fmla v19.4S, v7.4S, v0.S[0]
st1 {v16.4S, v17.4S}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32
b.ne 1b
ret
endfunc
function ff_vector_fmul_scalar_neon, export=1
mov w4, #15
bics w3, w2, w4
dup v16.4S, v0.S[0]
b.eq 3f
ld1 {v0.4S, v1.4S}, [x1], #32
1: subs w3, w3, #16
fmul v0.4S, v0.4S, v16.4S
ld1 {v2.4S, v3.4S}, [x1], #32
fmul v1.4S, v1.4S, v16.4S
fmul v2.4S, v2.4S, v16.4S
st1 {v0.4S, v1.4S}, [x0], #32
fmul v3.4S, v3.4S, v16.4S
b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32
st1 {v2.4S, v3.4S}, [x0], #32
b 1b
2: ands w2, w2, #15
st1 {v2.4S, v3.4S}, [x0], #32
b.eq 4f
3: ld1 {v0.4S}, [x1], #16
fmul v0.4S, v0.4S, v16.4S
st1 {v0.4S}, [x0], #16
subs w2, w2, #4
b.gt 3b
4: ret
endfunc
function ff_vector_dmul_scalar_neon, export=1
dup v16.2D, v0.D[0]
ld1 {v0.2D, v1.2D}, [x1], #32
1: subs w2, w2, #8
fmul v0.2D, v0.2D, v16.2D
ld1 {v2.2D, v3.2D}, [x1], #32
fmul v1.2D, v1.2D, v16.2D
fmul v2.2D, v2.2D, v16.2D
st1 {v0.2D, v1.2D}, [x0], #32
fmul v3.2D, v3.2D, v16.2D
ld1 {v0.2D, v1.2D}, [x1], #32
st1 {v2.2D, v3.2D}, [x0], #32
b.gt 1b
ret
endfunc
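// Reference semantics (a sketch following the shape of the C version
// in libavutil/float_dsp.c):
//   dst += len; win += len; src0 += len;
//   for (i = -len, j = len - 1; i < 0; i++, j--) {
//       dst[i] = src0[i] * win[j] - src1[j] * win[i];
//       dst[j] = src0[i] * win[i] + src1[j] * win[j];
//   }
// which is what the wi/wj and s0/s1 comments below refer to.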
function ff_vector_fmul_window_neon, export=1
sxtw x4, w4 // len
sub x2, x2, #8
sub x5, x4, #2
add x2, x2, x5, lsl #2 // src1 + 4 * (len - 4)
add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
mov x7, #-16
ld1 {v0.4S}, [x1], #16 // s0
ld1 {v2.4S}, [x3], #16 // wi
ld1 {v1.4S}, [x2], x7 // s1
1: ld1 {v3.4S}, [x6], x7 // wj
subs x4, x4, #4
fmul v17.4S, v0.4S, v2.4S // s0 * wi
rev64 v4.4S, v1.4S
rev64 v5.4S, v3.4S
rev64 v17.4S, v17.4S
ext v4.16B, v4.16B, v4.16B, #8 // s1_r
ext v5.16B, v5.16B, v5.16B, #8 // wj_r
ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev
fmul v16.4S, v0.4S, v5.4S // s0 * wj_r
fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj
b.eq 2f
ld1 {v0.4S}, [x1], #16
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7
ld1 {v2.4S}, [x3], #16
ld1 {v1.4S}, [x2], x7
st1 {v16.4S}, [x0], #16
b 1b
2:
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7
st1 {v16.4S}, [x0], #16
ret
endfunc
function ff_vector_fmul_add_neon, export=1
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32
ld1 {v4.4S, v5.4S}, [x3], #32
1: subs w4, w4, #8
fmla v4.4S, v0.4S, v2.4S
fmla v5.4S, v1.4S, v3.4S
b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32
st1 {v4.4S, v5.4S}, [x0], #32
ld1 {v4.4S, v5.4S}, [x3], #32
b 1b
2: st1 {v4.4S, v5.4S}, [x0], #32
ret
endfunc
function ff_vector_fmul_reverse_neon, export=1
sxtw x3, w3
add x2, x2, x3, lsl #2
sub x2, x2, #32
mov x4, #-32
ld1 {v2.4S, v3.4S}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32
1: subs x3, x3, #8
rev64 v3.4S, v3.4S
rev64 v2.4S, v2.4S
ext v3.16B, v3.16B, v3.16B, #8
ext v2.16B, v2.16B, v2.16B, #8
fmul v16.4S, v0.4S, v3.4S
fmul v17.4S, v1.4S, v2.4S
b.eq 2f
ld1 {v2.4S, v3.4S}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32
st1 {v16.4S, v17.4S}, [x0], #32
b 1b
2: st1 {v16.4S, v17.4S}, [x0], #32
ret
endfunc
function ff_butterflies_float_neon, export=1
1: ld1 {v0.4S}, [x0]
ld1 {v1.4S}, [x1]
subs w2, w2, #4
fsub v2.4S, v0.4S, v1.4S
fadd v3.4S, v0.4S, v1.4S
st1 {v2.4S}, [x1], #16
st1 {v3.4S}, [x0], #16
b.gt 1b
ret
endfunc
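// Computes the dot product sum(v1[i] * v2[i]). The two faddp
// instructions after the loop reduce the four per-lane partial sums in
// v2 down to a single scalar in s0.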
function ff_scalarproduct_float_neon, export=1
movi v2.4S, #0
1: ld1 {v0.4S}, [x0], #16
ld1 {v1.4S}, [x1], #16
subs w2, w2, #4
fmla v2.4S, v0.4S, v1.4S
b.gt 1b
faddp v0.4S, v2.4S, v2.4S
faddp s0, v0.2S
ret
endfunc


@ -0,0 +1,44 @@
/*
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_AARCH64_TIMER_H
#define AVUTIL_AARCH64_TIMER_H
#include <stdint.h>
#include "config.h"
#if HAVE_INLINE_ASM
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
{
uint64_t cycle_counter;
__asm__ volatile(
"isb \t\n"
"mrs %0, pmccntr_el0 "
: "=r"(cycle_counter) :: "memory" );
return cycle_counter;
}
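/* Note (environment assumption): user-space reads of pmccntr_el0 trap
 * unless the kernel has enabled EL0 access to the cycle counter via
 * PMUSERENR_EL0, so AV_READ_TIME may fault on a default Linux setup. */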
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_AARCH64_TIMER_H */