Bug 1588123 - Update libdav1d to 0.5 + one commit; r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D49897

--HG--
rename : third_party/dav1d/snap/snapcraft.yaml => third_party/dav1d/package/snap/snapcraft.yaml
extra : moz-landing-system : lando
Dan Minor 2019-10-21 13:47:25 +00:00
parent 8c695f6954
commit 1f9f60bf3d
48 changed files with 6749 additions and 943 deletions


@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit c0865f35c74bdcc71021630f64dca2db35d2bc8c (2019-09-19T12:07:23.000+02:00).
release: commit a6228f47f0eebcdfebb1753a786e3e1654b51ea4 (2019-10-11T13:55:51.000+03:00).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.4.0-49-gc0865f3"
#define DAV1D_VERSION "0.5.0-1-ga6228f4"


@ -241,7 +241,7 @@ build-ubuntu-snap:
- debian
- amd64
script:
- snapcraft snap
- cd package/snap && snapcraft snap
- |
if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then
echo $SNAP_LOGIN | base64 --decode | snapcraft login --with -
@ -251,7 +251,7 @@ build-ubuntu-snap:
artifacts:
name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
paths:
- dav1d_*.snap
- package/snap/dav1d_*.snap
expire_in: 1 week
allow_failure: true


@ -1,3 +1,25 @@
Changes for 0.5.0 'Asiatic Cheetah':
----------------------------
0.5.0 is a medium release fixing regressions and minor issues,
and improving speed significantly:
- Export ITU T.35 metadata
- Speed improvements on blend_ on ARM
- Speed improvements on decode_coef and MSAC
- NEON optimizations for blend*, w_mask_, ipred functions for ARM64
- NEON optimizations for CDEF and warp on ARM32
- SSE2 optimizations for MSAC hi_tok decoding
- SSSE3 optimizations for deblocking loopfilters and warp_affine
- AVX-2 optimizations for film grain and ipred_z2
- SSE4 optimizations for warp_affine
- VSX optimizations for wiener
- Fix inverse transform overflows in x86 and NEON asm
- Fix integer overflows with large frames
- Improve film grain generation to match reference code
- Improve compatibility with older binutils for ARM
- More advanced Player example in tools
Changes for 0.4.0 'Cheetah':
----------------------------
@ -11,6 +33,7 @@ Changes for 0.4.0 'Cheetah':
- NEON optimizations for blend functions on ARM
- NEON optimizations for w_mask functions on ARM
- NEON optimizations for inverse transforms on ARM64
- VSX optimizations for CDEF filter
- Improve handling of malloc failures
- Simple Player example in tools
@ -38,7 +61,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipref_cfl (420, 422 and 444)
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
- NEON optimizations for SGR and loop filter
- Minor crashes, improvements and build changes


@ -73,28 +73,15 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
2. Run `meson build --buildtype release`
3. Build with `ninja -C build`
2. Run `mkdir build && cd build` to create a build directory and enter it
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
4. Run `ninja` to compile
# Run tests
1. During initial build dir setup or `meson configure` specify `-Denable_tests=true`
2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
for checkasm
# Run testdata based tests
1. Checkout the test data repository
```
git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
```
2. During initial build dir setup or `meson configure` specify `-Denable_tests=true` and `-Dtestdata_tests=true`
```
meson .test -Denable_tests=true -Dtestdata_tests=true
```
3. In the build directory run `meson test` optionally with `-v` for more verbose output
1. In the root directory, run `git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data` to fetch the test data repository
2. During meson configuration, specify `-Dtestdata_tests=true`
3. Run `meson test -v` after compiling
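Taken together, the updated build and test instructions amount to the following sequence (a sketch only, assuming a checkout of the dav1d source tree with Meson, Ninja and nasm already installed):
```
# from the dav1d source root
git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
mkdir build && cd build
meson .. -Denable_tests=true -Dtestdata_tests=true
ninja
meson test -v
```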
# Support

Binary file not shown (Before: 0 B, After: 19 KiB).


@ -28,6 +28,7 @@
#include "vcs_version.h"
#include <getopt.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
@ -48,6 +49,9 @@
*/
typedef struct {
const char *inputfile;
int highquality;
int untimed;
int zerocopy;
} Dav1dPlaySettings;
#define WINDOW_WIDTH 910
@ -156,9 +160,13 @@ typedef struct rdr_info
// Callback to destroy the renderer
void (*destroy_renderer)(void *cookie);
// Callback to the render function that renders a previously sent frame
void (*render)(void *cookie);
void (*render)(void *cookie, const Dav1dPlaySettings *settings);
// Callback to the send frame function
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic);
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings);
// Callback for alloc/release pictures (optional)
int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
void (*release_pic)(Dav1dPicture *pic, void *cookie);
} Dav1dPlayRenderInfo;
#ifdef HAVE_PLACEBO_VULKAN
@ -325,7 +333,7 @@ static void placebo_renderer_destroy(void *cookie)
pl_context_destroy(&(rd_priv_ctx->ctx));
}
static void placebo_render(void *cookie)
static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
@ -358,8 +366,9 @@ static void placebo_render(void *cookie)
.height = img->params.h,
};
struct pl_render_params render_params = pl_render_default_params;
//render_params.upscaler = &pl_filter_ewa_lanczos;
struct pl_render_params render_params = {0};
if (settings->highquality)
render_params = pl_render_default_params;
struct pl_render_target target;
pl_render_target_from_swapchain(&target, &frame);
@ -385,7 +394,8 @@ static void placebo_render(void *cookie)
SDL_UnlockMutex(rd_priv_ctx->lock);
}
static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
@ -413,7 +423,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
.height = height,
.pixel_stride = 1,
.row_stride = dav1d_pic->stride[0],
.pixels = dav1d_pic->data[0],
.component_size = {8},
.component_map = {0},
};
@ -424,7 +433,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
.height = height/2,
.pixel_stride = 1,
.row_stride = dav1d_pic->stride[1],
.pixels = dav1d_pic->data[1],
.component_size = {8},
.component_map = {1},
};
@ -435,11 +443,23 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
.height = height/2,
.pixel_stride = 1,
.row_stride = dav1d_pic->stride[1],
.pixels = dav1d_pic->data[2],
.component_size = {8},
.component_map = {2},
};
if (settings->zerocopy) {
const struct pl_buf *buf = dav1d_pic->allocator_data;
assert(buf);
data_y.buf = data_u.buf = data_v.buf = buf;
data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
} else {
data_y.pixels = dav1d_pic->data[0];
data_u.pixels = dav1d_pic->data[1];
data_v.pixels = dav1d_pic->data[2];
}
bool ok = true;
ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
@ -456,11 +476,106 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
return !ok;
}
// Align to power of 2
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
int ret = DAV1D_ERR(ENOMEM);
// Copied from dav1d_default_picture_alloc
const int hbd = p->p.bpc > 8;
const int aligned_w = ALIGN2(p->p.w, 128);
const int aligned_h = ALIGN2(p->p.h, 128);
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
p->stride[0] = aligned_w << hbd;
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
// Align strides up to multiples of the GPU performance hints
p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
// Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
// The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
// even in the case that the driver gives us insane alignments
const size_t pic_size = y_sz + 2 * uv_sz;
const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
// Validate size limitations
if (total_size > gpu->limits.max_xfer_size) {
printf("alloc of %zu bytes exceeds limits\n", total_size);
goto err;
}
const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
.type = PL_BUF_TEX_TRANSFER,
.host_mapped = true,
.size = total_size,
.memory_type = PL_BUF_MEM_HOST,
.user_data = p,
});
if (!buf) {
printf("alloc of GPU mapped buffer failed\n");
goto err;
}
assert(buf->data);
uintptr_t base = (uintptr_t) buf->data, data[3];
data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
// Sanity check offset alignment for the sake of debugging
if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
data[1] - base != ALIGN2(data[1] - base, off_align) ||
data[2] - base != ALIGN2(data[2] - base, off_align))
{
printf("GPU buffer horribly misaligned, expect slowdown!\n");
}
p->allocator_data = (void *) buf;
p->data[0] = (void *) data[0];
p->data[1] = (void *) data[1];
p->data[2] = (void *) data[2];
ret = 0;
// fall through
err:
SDL_UnlockMutex(rd_priv_ctx->lock);
return ret;
}
static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
assert(pic->allocator_data);
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
SDL_UnlockMutex(rd_priv_ctx->lock);
}
static const Dav1dPlayRenderInfo renderer_info = {
.create_renderer = placebo_renderer_create,
.destroy_renderer = placebo_renderer_destroy,
.render = placebo_render,
.update_frame = placebo_upload_planes
.update_frame = placebo_upload_planes,
.alloc_pic = placebo_alloc_pic,
.release_pic = placebo_release_pic,
};
#else
@ -516,7 +631,7 @@ static void sdl_renderer_destroy(void *cookie)
free(rd_priv_ctx);
}
static void sdl_render(void *cookie)
static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
@ -536,7 +651,8 @@ static void sdl_render(void *cookie)
SDL_UnlockMutex(rd_priv_ctx->lock);
}
static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic)
static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
@ -647,8 +763,11 @@ static void dp_settings_print_usage(const char *const app,
fprintf(stderr, "Usage: %s [options]\n\n", app);
fprintf(stderr, "Supported options:\n"
" --input/-i $file: input file\n"
" --untimed/-u: ignore PTS, render as fast as possible\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
" --version/-v: print version and exit\n");
exit(1);
}
@ -672,19 +791,23 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
// Short options
static const char short_opts[] = "i:v";
static const char short_opts[] = "i:vuz";
enum {
ARG_FRAME_THREADS = 256,
ARG_TILE_THREADS,
ARG_HIGH_QUALITY,
};
// Long options
static const struct option long_opts[] = {
{ "input", 1, NULL, 'i' },
{ "version", 0, NULL, 'v' },
{ "untimed", 0, NULL, 'u' },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
{ NULL, 0, NULL, 0 },
};
@ -696,6 +819,21 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
case 'v':
fprintf(stderr, "%s\n", dav1d_version());
exit(0);
case 'u':
settings->untimed = true;
break;
case ARG_HIGH_QUALITY:
settings->highquality = true;
#ifndef HAVE_PLACEBO_VULKAN
fprintf(stderr, "warning: --highquality requires libplacebo\n");
#endif
break;
case 'z':
settings->zerocopy = true;
#ifndef HAVE_PLACEBO_VULKAN
fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
#endif
break;
case ARG_FRAME_THREADS:
lib_settings->n_frame_threads =
parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
@ -811,7 +949,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
Dav1dPicture *dav1d_pic)
{
renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic);
renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
rd_ctx->current_pts = dav1d_pic->m.timestamp;
}
@ -853,16 +991,20 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
rd_ctx->last_pts = rd_ctx->current_pts;
// In untimed mode, simply don't wait
if (rd_ctx->settings.untimed)
wait_time = 0;
// This way of timing the playback is not accurate, as there is no guarantee
that SDL_Delay will wait for exactly the requested amount of time so in an
// accurate player this would need to be done in a better way.
if (wait_time >= 0) {
if (wait_time > 0) {
SDL_Delay(wait_time);
} else if (wait_time < -10) { // Do not warn for minor time drifts
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
}
renderer_info.render(rd_ctx->rd_priv);
renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings);
rd_ctx->last_ticks = SDL_GetTicks();
}
@ -1046,6 +1188,18 @@ int main(int argc, char **argv)
// Parse and validate arguments
dp_rd_ctx_parse_args(rd_ctx, argc, argv);
if (rd_ctx->settings.zerocopy) {
if (renderer_info.alloc_pic) {
rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
.cookie = rd_ctx->rd_priv,
.alloc_picture_callback = renderer_info.alloc_pic,
.release_picture_callback = renderer_info.release_pic,
};
} else {
fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
}
}
// Start decoder thread
decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
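For reference, the new dav1dplay options added above would be exercised roughly as follows (a hypothetical invocation: the binary name and input file are assumptions, and --highquality/--zerocopy only have an effect when the player is built with libplacebo/Vulkan support):
```
# enable the high quality and zero-copy upload paths
./dav1dplay -i input.ivf --highquality --zerocopy
# decode as fast as possible, ignoring PTS
./dav1dplay -i input.ivf --untimed
```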


@ -23,7 +23,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.4.0',
version: '0.5.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',


@ -17,7 +17,7 @@ apps:
parts:
dav1d:
plugin: meson
source: .
source: ../../
build-packages: [ 'nasm' ]
meson-parameters:
- --prefix=/usr

third_party/dav1d/src/arm/32/cdef.S (vendored, new file, 660 lines)

@ -0,0 +1,660 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
tst r6, #1 // CDEF_HAVE_LEFT
beq 2f
// CDEF_HAVE_LEFT
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
ldrh r12, [\s1, #-2]
vldr \n1, [\s1]
vdup.16 d4, r12
ldrh r12, [\s1, #\w]
vmov.16 d4[1], r12
ldrh r12, [\s2, #-2]
vldr \n2, [\s2]
vmov.16 d4[2], r12
ldrh r12, [\s2, #\w]
vmovl.u8 q0, d0
vmov.16 d4[3], r12
vmovl.u8 q1, d2
vmovl.u8 q2, d4
vstr s8, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s9, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s10, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s11, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldrh r12, [\s1, #-2]
vldr \n1, [\s1]
vdup.16 d4, r12
ldrh r12, [\s2, #-2]
vldr \n2, [\s2]
vmovl.u8 q0, d0
vmov.16 d4[1], r12
vmovl.u8 q1, d2
vmovl.u8 q2, d4
vstr s8, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s9, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s12, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
vldr \n1, [\s1]
ldrh r12, [\s1, #\w]
vldr \n2, [\s2]
vdup.16 d4, r12
ldrh r12, [\s2, #\w]
vmovl.u8 q0, d0
vmov.16 d4[1], r12
vmovl.u8 q1, d2
vmovl.u8 q2, d4
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s8, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s12, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s9, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
vldr \n1, [\s1]
vldr \n2, [\s2]
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s12, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s12, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
.endif
3:
.endm
.macro load_n_incr dst, src, incr, w
.if \w == 4
vld1.32 {\dst\()[0]}, [\src, :32], \incr
.else
vld1.8 {\dst\()}, [\src, :64], \incr
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldr r6, [sp, #28]
vmov.i16 q3, #0x8000
tst r6, #4 // CDEF_HAVE_TOP
bne 1f
// !CDEF_HAVE_TOP
sub r12, r0, #2*(2*\stride+2)
vmov.i16 q2, #0x8000
vst1.16 {q2,q3}, [r12]!
.if \w == 8
vst1.16 {q2,q3}, [r12]!
.endif
b 3f
1:
// CDEF_HAVE_TOP
ldr r7, [r4]
ldr lr, [r4, #4]
sub r0, r0, #2*(2*\stride)
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
// Middle section
3:
tst r6, #1 // CDEF_HAVE_LEFT
beq 2f
// CDEF_HAVE_LEFT
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldrh r12, [r3], #2
vldr \n1, [r1]
vdup.16 d2, r12
ldrh r12, [r1, #\w]
add r1, r1, r2
subs r5, r5, #1
vmov.16 d2[1], r12
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s4, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s5, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ldrh r12, [r3], #2
load_n_incr d0, r1, r2, \w
vdup.16 d2, r12
subs r5, r5, #1
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s4, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 1b
b 3f
2:
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldrh r12, [r1, #\w]
load_n_incr d0, r1, r2, \w
vdup.16 d2, r12
subs r5, r5, #1
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s4, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
load_n_incr d0, r1, r2, \w
subs r5, r5, #1
vmovl.u8 q0, d0
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 1b
3:
tst r6, #8 // CDEF_HAVE_BOTTOM
bne 1f
// !CDEF_HAVE_BOTTOM
sub r12, r0, #4
vmov.i16 q2, #0x8000
vst1.16 {q2,q3}, [r12]!
.if \w == 8
vst1.16 {q2,q3}, [r12]!
.endif
pop {r4-r7,pc}
1:
// CDEF_HAVE_BOTTOM
add r7, r1, r2
pad_top_bottom r1, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 1
endfunc
.endm
padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8, s0, d0, s4, d2, 64
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
.endm
dir_table 8, 16
dir_table 4, 8
const pri_taps
.byte 4, 2, 3, 3
endconst
.macro load_px d11, d12, d21, d22, w
.if \w == 8
add r6, r2, r9, lsl #1 // x + off
sub r9, r2, r9, lsl #1 // x - off
vld1.16 {\d11,\d12}, [r6] // p0
vld1.16 {\d21,\d22}, [r9] // p1
.else
add r6, r2, r9, lsl #1 // x + off
sub r9, r2, r9, lsl #1 // x - off
vld1.16 {\d11}, [r6] // p0
add r6, r6, #2*8 // += stride
vld1.16 {\d21}, [r9] // p1
add r9, r9, #2*8 // += stride
vld1.16 {\d12}, [r6] // p0
vld1.16 {\d22}, [r9] // p1
.endif
.endm
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmp \threshold, #0
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
beq 3f
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
vshl.u16 q12, q11, \shift // abs(diff) >> shift
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vsub.i16 q10, \s1, q0 // diff = p0 - px
vsub.u16 q13, \s2, q0 // diff = p1 - px
vneg.s16 q8, q9 // -clip
vneg.s16 q11, q12 // -clip
vmin.s16 q10, q10, q9 // imin(diff, clip)
vmin.s16 q13, q13, q12 // imin(diff, clip)
vdup.16 q9, \tap // taps[k]
vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
3:
.endm
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
vdup.16 q5, r3 // threshold
vdup.16 q7, r4 // threshold
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
vdup.16 q6, d8[1]
vdup.16 q4, d8[0]
1:
.if \w == 8
vld1.16 {q0}, [r2, :128] // px
.else
add r12, r2, #2*8
vld1.16 {d0}, [r2, :64] // px
vld1.16 {d1}, [r12, :64] // px
.endif
vmov.u16 q1, #0 // sum
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max
// Instead of loading sec_taps 2, 1 from memory, just set it
// to 2 initially and decrease for the second round.
mov lr, #2 // sec_taps[0]
2:
ldrsb r9, [r5] // off1
load_px d28, d29, d30, d31, \w
add r5, r5, #4 // +2*2
ldrsb r9, [r5] // off2
ldrb r12, [r8] // *pri_taps
handle_pixel q14, q15, r3, q5, q4, r12
load_px d28, d29, d30, d31, \w
add r5, r5, #8 // +2*4
ldrsb r9, [r5] // off3
handle_pixel q14, q15, r4, q7, q6, lr
load_px d28, d29, d30, d31, \w
handle_pixel q14, q15, r4, q7, q6, lr
sub r5, r5, #11 // x8 -= 2*(2+4); x8 += 1;
subs lr, lr, #1 // sec_tap-- (value)
add r8, r8, #1 // pri_taps++ (pointer)
bne 2b
vshr.s16 q14, q1, #15 // -(sum < 0)
vadd.i16 q1, q1, q14 // sum - (sum < 0)
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
vmin.s16 q0, q0, q3
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
vmovn.u16 d0, q0
.if \w == 8
add r2, r2, #2*16 // tmp += tmp_stride
subs r7, r7, #1 // h--
vst1.8 {d0}, [r0, :64], r1
.else
vst1.32 {d0[0]}, [r0, :32], r1
add r2, r2, #2*16 // tmp += 2*tmp_stride
subs r7, r7, #2 // h -= 2
vst1.32 {d0[1]}, [r0, :32], r1
.endif
// Reset pri_taps/sec_taps back to the original point
sub r5, r5, #2
sub r8, r8, #2
bgt 1b
vpop {q4-q7}
pop {r4-r9,pc}
endfunc
.endm
filter 8
filter 4
const div_table, align=4
.short 840, 420, 280, 210, 168, 140, 120, 105
endconst
const alt_fact, align=4
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
// unsigned *const var)
function cdef_find_dir_neon, export=1
push {lr}
vpush {q4-q7}
sub sp, sp, #32 // cost
mov r3, #8
vmov.u16 q1, #0 // q0-q1 sum_diag[0]
vmov.u16 q3, #0 // q2-q3 sum_diag[1]
vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
vmov.u16 q8, #0 // q6,d16 sum_alt[0]
// q7,d17 sum_alt[1]
vmov.u16 q9, #0 // q9,d22 sum_alt[2]
vmov.u16 q11, #0
vmov.u16 q10, #0 // q10,d23 sum_alt[3]
.irpc i, 01234567
vld1.8 {d30}, [r0, :64], r1
vmov.u8 d31, #128
vsubl.u8 q15, d30, d31 // img[x] - 128
vmov.u16 q14, #0
.if \i == 0
vmov q0, q15 // sum_diag[0]
.else
vext.8 q12, q14, q15, #(16-2*\i)
vext.8 q13, q15, q14, #(16-2*\i)
vadd.i16 q0, q0, q12 // sum_diag[0]
vadd.i16 q1, q1, q13 // sum_diag[0]
.endif
vrev64.16 q13, q15
vswp d26, d27 // [-x]
.if \i == 0
vmov q2, q13 // sum_diag[1]
.else
vext.8 q12, q14, q13, #(16-2*\i)
vext.8 q13, q13, q14, #(16-2*\i)
vadd.i16 q2, q2, q12 // sum_diag[1]
vadd.i16 q3, q3, q13 // sum_diag[1]
.endif
vpadd.u16 d26, d30, d31 // [(x >> 1)]
vmov.u16 d27, #0
vpadd.u16 d24, d26, d28
vpadd.u16 d24, d24, d28 // [y]
vmov.u16 r12, d24[0]
vadd.i16 q5, q5, q15 // sum_hv[1]
.if \i < 4
vmov.16 d8[\i], r12 // sum_hv[0]
.else
vmov.16 d9[\i-4], r12 // sum_hv[0]
.endif
.if \i == 0
vmov.u16 q6, q13 // sum_alt[0]
.else
vext.8 q12, q14, q13, #(16-2*\i)
vext.8 q14, q13, q14, #(16-2*\i)
vadd.i16 q6, q6, q12 // sum_alt[0]
vadd.i16 d16, d16, d28 // sum_alt[0]
.endif
vrev64.16 d26, d26 // [-(x >> 1)]
vmov.u16 q14, #0
.if \i == 0
vmov q7, q13 // sum_alt[1]
.else
vext.8 q12, q14, q13, #(16-2*\i)
vext.8 q13, q13, q14, #(16-2*\i)
vadd.i16 q7, q7, q12 // sum_alt[1]
vadd.i16 d17, d17, d26 // sum_alt[1]
.endif
.if \i < 6
vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
vadd.i16 q9, q9, q12 // sum_alt[2]
vadd.i16 d22, d22, d26 // sum_alt[2]
.else
vadd.i16 q9, q9, q15 // sum_alt[2]
.endif
.if \i == 0
vmov q10, q15 // sum_alt[3]
.elseif \i == 1
vadd.i16 q10, q10, q15 // sum_alt[3]
.else
vext.8 q12, q14, q15, #(16-2*(\i/2))
vext.8 q13, q15, q14, #(16-2*(\i/2))
vadd.i16 q10, q10, q12 // sum_alt[3]
vadd.i16 d23, d23, d26 // sum_alt[3]
.endif
.endr
vmov.u32 q15, #105
vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
vmlal.s16 q12, d9, d9
vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
vmlal.s16 q13, d11, d11
vadd.s32 d8, d24, d25
vadd.s32 d9, d26, d27
vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
vmul.i32 d8, d8, d30 // cost[2,6] *= 105
vrev64.16 q1, q1
vrev64.16 q3, q3
vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
vstr s16, [sp, #2*4] // cost[2]
vstr s17, [sp, #6*4] // cost[6]
movrel_local r12, div_table
vld1.16 {q14}, [r12, :128]
vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
vmull.s16 q12, d1, d1
vmlal.s16 q5, d2, d2
vmlal.s16 q12, d3, d3
vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
vmull.s16 q1, d5, d5
vmlal.s16 q0, d6, d6
vmlal.s16 q1, d7, d7
vmovl.u16 q13, d28 // div_table
vmovl.u16 q14, d29
vmul.i32 q5, q5, q13 // cost[0]
vmla.i32 q5, q12, q14
vmul.i32 q0, q0, q13 // cost[4]
vmla.i32 q0, q1, q14
vadd.i32 d10, d10, d11
vadd.i32 d0, d0, d1
vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
movrel_local r12, alt_fact
vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
vstr s0, [sp, #0*4] // cost[0]
vstr s1, [sp, #4*4] // cost[4]
vmovl.u16 q13, d29 // div_table[2*m+1] + 105
vmovl.u16 q14, d30
vmovl.u16 q15, d31
.macro cost_alt dest, s1, s2, s3, s4, s5, s6
vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
vmull.s16 q2, \s2, \s2
vmull.s16 q3, \s3, \s3
vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
vmull.s16 q12, \s5, \s5
vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
vmla.i32 q1, q2, q14
vmla.i32 q1, q3, q15
vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
vmla.i32 q5, q12, q14
vmla.i32 q5, q6, q15
vadd.i32 d2, d2, d3
vadd.i32 d3, d10, d11
vpadd.i32 \dest, d2, d3 // *cost_ptr
.endm
cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
vstr s28, [sp, #1*4] // cost[1]
vstr s29, [sp, #3*4] // cost[3]
mov r0, #0 // best_dir
vmov.32 r1, d0[0] // best_cost
mov r3, #1 // n
vstr s30, [sp, #5*4] // cost[5]
vstr s31, [sp, #7*4] // cost[7]
vmov.32 r12, d14[0]
.macro find_best s1, s2, s3
.ifnb \s2
vmov.32 lr, \s2
.endif
cmp r12, r1 // cost[n] > best_cost
itt gt
movgt r0, r3 // best_dir = n
movgt r1, r12 // best_cost = cost[n]
.ifnb \s2
add r3, r3, #1 // n++
cmp lr, r1 // cost[n] > best_cost
vmov.32 r12, \s3
itt gt
movgt r0, r3 // best_dir = n
movgt r1, lr // best_cost = cost[n]
add r3, r3, #1 // n++
.endif
.endm
find_best d14[0], d8[0], d14[1]
find_best d14[1], d0[1], d15[0]
find_best d15[0], d8[1], d15[1]
find_best d15[1]
eor r3, r0, #4 // best_dir ^4
ldr r12, [sp, r3, lsl #2]
sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
lsr r1, r1, #10
str r1, [r2] // *var
add sp, sp, #32
vpop {q4-q7}
pop {pc}
endfunc


@ -2971,3 +2971,206 @@ endfunc
filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
.macro load_filter_ptr src
asr r12, \src, #10
add r12, r11, r12, lsl #3
.endm
.macro load_filter_coef dst, src, inc
vld1.8 {\dst}, [r12, :64]
add \src, \src, \inc
.endm
.macro load_filter_row dst, src, inc
load_filter_ptr \src
load_filter_coef \dst, \src, \inc
.endm
function warp_filter_horz_neon
load_filter_ptr r5 // filter 0
vld1.16 {q7}, [r2], r3
load_filter_coef d0, r5, r7 // filter 0
vmovl.u8 q6, d14 // original pixels
load_filter_row d2, r5, r7 // filter 1
vmovl.u8 q7, d15 // original pixels
load_filter_row d4, r5, r7 // filter 2
vmovl.s8 q0, d0 // filter 0
vext.8 q3, q6, q7, #2*1 // filter 1 pixels
load_filter_ptr r5 // filter 3
vmovl.s8 q1, d2 // filter 1
vmul.i16 q5, q6, q0 // filter 0 output
load_filter_coef d0, r5, r7 // filter 3
vmovl.s8 q2, d4 // filter 2
load_filter_ptr r5 // filter 4
vext.8 q4, q6, q7, #2*2 // filter 2 pixels
vmul.i16 q3, q3, q1 // filter 1 output
load_filter_coef d2, r5, r7 // filter 4
vmul.i16 q4, q4, q2 // filter 2 output
vext.8 q2, q6, q7, #2*3 // filter 3 pixels
vmovl.s8 q0, d0 // filter 3
vpaddl.s16 q5, q5 // pixel 0 (4x32)
vpaddl.s16 q3, q3 // pixel 1 (4x32)
vmul.i16 q0, q2, q0 // filter 3 output
load_filter_ptr r5 // filter 5
vext.8 q2, q6, q7, #2*4 // filter 4 pixels
vmovl.s8 q1, d2 // filter 4
vpaddl.s16 q4, q4 // pixel 2 (4x32)
vpadd.s32 d10, d10, d11 // pixel 0 (2x32)
vpadd.s32 d11, d6, d7 // pixel 1 (2x32)
load_filter_coef d6, r5, r7 // filter 5
vmul.i16 q1, q2, q1 // filter 4 output
vpadd.s32 d8, d8, d9 // pixel 2 (2x32)
load_filter_ptr r5 // filter 6
vpaddl.s16 q0, q0 // pixel 3 (4x32)
vpadd.s32 d10, d10, d11 // pixel 0,1
vext.8 q2, q6, q7, #2*5 // filter 5 pixels
vmovl.s8 q3, d6 // filter 5
vpaddl.s16 q1, q1 // pixel 4 (4x32)
vpadd.s32 d9, d0, d1 // pixel 3 (2x32)
load_filter_coef d0, r5, r7 // filter 6
vmul.i16 q2, q2, q3 // filter 5 output
vpadd.s32 d11, d8, d9 // pixel 2,3
load_filter_ptr r5 // filter 7
vpaddl.s16 q2, q2 // pixel 5 (4x32)
vpadd.s32 d8, d2, d3 // pixel 4 (2x32)
vext.8 q3, q6, q7, #2*6 // filter 6 pixels
vmovl.s8 q0, d0 // filter 6
vpadd.s32 d9, d4, d5 // pixel 5 (2x32)
load_filter_coef d4, r5, r7 // filter 7
vpadd.s32 d8, d8, d9 // pixel 4,5
vext.8 q1, q6, q7, #2*7 // filter 7 pixels
vmovl.s8 q2, d4 // filter 7
vmul.i16 q3, q3, q0 // filter 6 output
vmul.i16 q1, q1, q2 // filter 7 output
sub r5, r5, r7, lsl #3
vpaddl.s16 q3, q3 // pixel 6 (4x32)
vpaddl.s16 q1, q1 // pixel 7 (4x32)
vpadd.s32 d6, d6, d7 // pixel 6 (2x32)
vpadd.s32 d2, d2, d3 // pixel 7 (2x32)
vpadd.s32 d9, d6, d2 // pixel 6,7
add r5, r5, r8
vrshrn.s32 d10, q5, #3
vrshrn.s32 d11, q4, #3
bx lr
endfunc
// void dav1d_warp_affine_8x8_8bpc_neon(
// pixel *dst, const ptrdiff_t dst_stride,
// const pixel *src, const ptrdiff_t src_stride,
// const int16_t *const abcd, int mx, int my)
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldr r6, [sp, #108]
ldrd r8, r9, [r4]
sxth r7, r8
asr r8, r8, #16
asr r4, r9, #16
sxth r9, r9
mov r10, #8
sub r2, r2, r3, lsl #1
sub r2, r2, r3
sub r2, r2, #3
movrel r11, X(mc_warp_filter), 64*8
.ifnb \t
lsl r1, r1, #1
.endif
add r5, r5, #512
add r6, r6, #512
bl warp_filter_horz_neon
vmov q8, q5
bl warp_filter_horz_neon
vmov q9, q5
bl warp_filter_horz_neon
vmov q10, q5
bl warp_filter_horz_neon
vmov q11, q5
bl warp_filter_horz_neon
vmov q12, q5
bl warp_filter_horz_neon
vmov q13, q5
bl warp_filter_horz_neon
vmov q14, q5
1:
bl warp_filter_horz_neon
vmov q15, q5
load_filter_row d8, r6, r9
load_filter_row d9, r6, r9
load_filter_row d10, r6, r9
load_filter_row d11, r6, r9
load_filter_row d12, r6, r9
load_filter_row d13, r6, r9
load_filter_row d14, r6, r9
load_filter_row d15, r6, r9
transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
vmovl.s8 q1, d8
vmovl.s8 q2, d9
vmovl.s8 q3, d10
vmovl.s8 q4, d11
vmovl.s8 q5, d12
vmovl.s8 q6, d13
sub r6, r6, r9, lsl #3
// This ordering of vmull/vmlal is highly beneficial for
// Cortex A8/A9/A53 here, but harmful for Cortex A7.
vmull.s16 q0, d16, d2
vmlal.s16 q0, d18, d4
vmlal.s16 q0, d20, d6
vmlal.s16 q0, d22, d8
vmlal.s16 q0, d24, d10
vmlal.s16 q0, d26, d12
vmull.s16 q1, d17, d3
vmlal.s16 q1, d19, d5
vmlal.s16 q1, d21, d7
vmlal.s16 q1, d23, d9
vmlal.s16 q1, d25, d11
vmlal.s16 q1, d27, d13
vmovl.s8 q2, d14
vmovl.s8 q3, d15
vmlal.s16 q0, d28, d4
vmlal.s16 q0, d30, d6
vmlal.s16 q1, d29, d5
vmlal.s16 q1, d31, d7
vmov q8, q9
vmov q9, q10
vqrshrn.s32 d0, q0, #\shift
vmov q10, q11
vqrshrn.s32 d1, q1, #\shift
vmov q11, q12
vmov q12, q13
.ifb \t
vqmovun.s16 d0, q0
.endif
vmov q13, q14
vmov q14, q15
subs r10, r10, #1
.ifnb \t
vst1.16 {q0}, [r0, :128], r1
.else
vst1.8 {d0}, [r0, :64], r1
.endif
add r6, r6, r4
bgt 1b
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
.endm
warp , 11
warp t, 7


@ -32,6 +32,20 @@
#include "config.h"
#include "src/arm/asm.S"
.macro movrel_local rd, val, offset=0
#if defined(PIC)
ldr \rd, 1f
b 2f
1:
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
2:
add \rd, \rd, pc
#else
movw \rd, #:lower16:\val+\offset
movt \rd, #:upper16:\val+\offset
#endif
.endm
.macro movrel rd, val, offset=0
#if defined(PIC) && defined(__APPLE__)
ldr \rd, 1f
@ -50,17 +64,24 @@
.indirect_symbol \val
.word 0
.text
#elif defined(PIC)
ldr \rd, 1f
b 2f
1:
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
2:
add \rd, \rd, pc
#else
movw \rd, #:lower16:\val+\offset
movt \rd, #:upper16:\val+\offset
movrel_local \rd, \val, \offset
#endif
.endm
.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
vtrn.32 \q0, \q2
vtrn.32 \q1, \q3
vtrn.16 \r0, \r2
vtrn.16 \r1, \r3
vtrn.16 \r4, \r6
vtrn.16 \r5, \r7
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
vtrn.8 \r4, \r5
vtrn.8 \r6, \r7
.endm
#endif /* DAV1D_SRC_ARM_32_UTIL_S */


@ -129,6 +129,14 @@
3:
.endm
.macro load_n_incr dst, src, incr, w
.if \w == 4
ld1 {\dst\().s}[0], [\src], \incr
.else
ld1 {\dst\().8b}, [\src], \incr
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
@ -163,9 +171,8 @@ function cdef_padding\w\()_neon, export=1
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ld1 {v0.h}[0], [x3], #2
ldr \rn\()1, [x1]
ldr h2, [x1, #\w]
add x1, x1, x2
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
@ -179,11 +186,7 @@ function cdef_padding\w\()_neon, export=1
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
ld1 {v0.h}[0], [x3], #2
.if \w == 8
ld1 {v1.8b}, [x1], x2
.else
ld1 {v1.s}[0], [x1], x2
.endif
load_n_incr v1, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
@ -198,9 +201,8 @@ function cdef_padding\w\()_neon, export=1
b.eq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldr \rn\()0, [x1]
ldr h1, [x1, #\w]
add x1, x1, x2
load_n_incr v0, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
uxtl v1.8h, v1.8b
@ -212,11 +214,7 @@ function cdef_padding\w\()_neon, export=1
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
.if \w == 8
ld1 {v0.8b}, [x1], x2
.else
ld1 {v0.s}[0], [x1], x2
.endif
load_n_incr v0, x1, x2, \w
subs w5, w5, #1
uxtl v0.8h, v0.8b
str s31, [x0]
@ -299,17 +297,17 @@ endconst
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
uqsub v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
cmhi v18.8h, v0.8h, \s1\().8h // px > p0
cmhi v22.8h, v0.8h, \s2\().8h // px > p1
umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
neg v16.8h, v17.8h // -clip
neg v20.8h, v21.8h // -clip
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
dup v19.8h, \tap // taps[k]
neg v16.8h, v17.8h // -imin()
neg v20.8h, v21.8h // -imin()
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
3:
@ -325,19 +323,18 @@ function cdef_filter\w\()_neon, export=1
add x8, x8, w9, uxtw #1
movrel x9, directions\w
add x5, x9, w5, uxtw #1
movi v30.8h, #15
dup v28.8h, w6 // damping
movi v30.4h, #15
dup v28.4h, w6 // damping
dup v25.8h, w3 // threshold
dup v27.8h, w4 // threshold
clz v24.8h, v25.8h // clz(threshold)
clz v26.8h, v27.8h // clz(threshold)
sub v24.8h, v30.8h, v24.8h // ulog2(threshold)
sub v26.8h, v30.8h, v26.8h // ulog2(threshold)
uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
neg v24.8h, v24.8h // -shift
neg v26.8h, v26.8h // -shift
trn1 v24.4h, v25.4h, v27.4h
clz v24.4h, v24.4h // clz(threshold)
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
neg v24.4h, v24.4h // -shift
dup v26.8h, v24.h[1]
dup v24.8h, v24.h[0]
1:
.if \w == 8
@ -467,15 +464,15 @@ function cdef_find_dir_neon, export=1
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
add v6.8h, v6.8h, v22.8h // sum_alt[0]
add v7.8h, v7.8h, v23.8h // sum_alt[0]
add v7.4h, v7.4h, v23.4h // sum_alt[0]
add v16.8h, v16.8h, v24.8h // sum_alt[1]
add v17.8h, v17.8h, v25.8h // sum_alt[1]
add v17.4h, v17.4h, v25.4h // sum_alt[1]
.endif
.if \i < 6
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
add v18.8h, v18.8h, v22.8h // sum_alt[2]
add v19.8h, v19.8h, v23.8h // sum_alt[2]
add v19.4h, v19.4h, v23.4h // sum_alt[2]
.else
add v18.8h, v18.8h, v26.8h // sum_alt[2]
.endif
@ -487,7 +484,7 @@ function cdef_find_dir_neon, export=1
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
add v20.8h, v20.8h, v24.8h // sum_alt[3]
add v21.8h, v21.8h, v25.8h // sum_alt[3]
add v21.4h, v21.4h, v25.4h // sum_alt[3]
.endif
.endr
@ -504,10 +501,8 @@ function cdef_find_dir_neon, export=1
rev64 v1.8h, v1.8h
rev64 v3.8h, v3.8h
ext v1.16b, v1.16b, v1.16b, #8 // sum_diag[0][15-n]
ext v3.16b, v3.16b, v3.16b, #8 // sum_diag[1][15-n]
ext v1.16b, v1.16b, v1.16b, #2 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #2 // sum_diag[1][14-n]
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
str s4, [sp, #2*4] // cost[2]
str s5, [sp, #6*4] // cost[6]
@ -559,16 +554,17 @@ function cdef_find_dir_neon, export=1
addv \d2, v25.4s // *cost_ptr
.endm
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s6, [sp, #1*4] // cost[1]
str s16, [sp, #3*4] // cost[3]
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w0, #0 // best_dir
mov w1, v0.s[0] // best_cost
mov w3, #1 // n
str s18, [sp, #5*4] // cost[5]
str s20, [sp, #7*4] // cost[7]
mov w4, v6.s[0]
.macro find_best s1, s2, s3

third_party/dav1d/src/arm/64/ipred.S (vendored, new file, 2443 lines): diff suppressed because it is too large.

@ -148,27 +148,6 @@ endconst
.endif
.endm
.macro saddl_sz d0, d1, s0, s1, sz
saddl \d0\().4s, \s0\().4h, \s1\().4h
.ifc \sz, .8h
saddl2 \d1\().4s, \s0\().8h, \s1\().8h
.endif
.endm
.macro ssubl_sz d0, d1, s0, s1, sz
ssubl \d0\().4s, \s0\().4h, \s1\().4h
.ifc \sz, .8h
ssubl2 \d1\().4s, \s0\().8h, \s1\().8h
.endif
.endm
.macro mul_4s_sz d0, d1, s0, s1, c, sz
mul \d0\().4s, \s0\().4s, \c
.ifc \sz, .8h
mul \d1\().4s, \s1\().4s, \c
.endif
.endm
.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
sqrdmulh \r0\sz, \r0\sz, \c
sqrdmulh \r1\sz, \r1\sz, \c
@ -489,18 +468,18 @@ endfunc
.endm
.macro idct_4 r0, r1, r2, r3, sz
add v2\sz, \r0\sz, \r2\sz
sub v3\sz, \r0\sz, \r2\sz
smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz
smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz
sqrdmulh v2\sz, v2\sz, v0.h[1]
sqrdmulh v3\sz, v3\sz, v0.h[1]
smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz
rshrn_sz v6, v6, v7, #12, \sz
rshrn_sz v4, v4, v5, #12, \sz
rshrn_sz v7, v4, v5, #12, \sz
smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz
rshrn_sz v2, v2, v3, #12, \sz
rshrn_sz v3, v4, v5, #12, \sz
sqadd \r0\sz, v2\sz, v6\sz
sqsub \r3\sz, v2\sz, v6\sz
sqadd \r1\sz, v3\sz, v4\sz
sqsub \r2\sz, v3\sz, v4\sz
sqadd \r1\sz, v3\sz, v7\sz
sqsub \r2\sz, v3\sz, v7\sz
.endm
function inv_dct_4x4_neon
@ -780,11 +759,10 @@ def_fn_4x4 identity, flipadst
sqadd v3\sz, \r7\sz, \r5\sz // t7
sqsub \r3\sz, \r7\sz, \r5\sz // t6a
sub \r5\sz, \r3\sz, \r1\sz // -> t5
add \r7\sz, \r3\sz, \r1\sz // -> t6
sqrdmulh v4\sz, \r5\sz, v0.h[1] // t5
sqrdmulh v5\sz, \r7\sz, v0.h[1] // t6
smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
rshrn_sz v4, v4, v5, #12, \sz // t5
rshrn_sz v5, v6, v7, #12, \sz // t6
sqsub \r7\sz, \r0\sz, v3\sz // out7
sqadd \r0\sz, \r0\sz, v3\sz // out0
@ -865,22 +843,14 @@ endfunc
sqsub v5\sz, v5\sz, v19\sz // t7
sqneg \o1\()\sz, \o1\()\sz // out1
movi v0.4s, #2896>>4
saddl_sz v18, v19, v2, v4, \sz // -> out3 (v19 or v20)
ssubl_sz v6, v7, v2, v4, \sz // -> out4 (v20 or v19)
ssubl_sz v20, v21, v3, v5, \sz // -> out5 (v21 or v18)
saddl_sz v4, v5, v3, v5, \sz // -> out2 (v18 or v21)
mul_4s_sz v18, v19, v18, v19, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v20, v21, v20, v21, v0.s[0], \sz
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
rshrn_sz v2, v18, v19, #8, \sz // out3
rshrn_sz v3, v20, v21, #8, \sz // out5
rshrn_sz \o2, v4, v5, #8, \sz // out2 (v18 or v21)
rshrn_sz \o4, v6, v7, #8, \sz // out4 (v20 or v19)
smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
rshrn_sz v2, v18, v19, #12, \sz // out3
smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
rshrn_sz v3, v20, v21, #12, \sz // out5
rshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
rshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
sqneg \o3\()\sz, v2\sz // out3
sqneg \o5\()\sz, v3\sz // out5
@ -1127,14 +1097,15 @@ def_fns_48 8, 4
sqsub v25\sz, v27\sz, v29\sz // t13
sqadd v27\sz, v27\sz, v29\sz // t14
sub v23\sz, v3\sz, v2\sz // -> t11
add v29\sz, v3\sz, v2\sz // -> t12
sub v6\sz, v25\sz, v21\sz // -> t10a
add v7\sz, v25\sz, v21\sz // -> t13a
sqrdmulh v2\sz, v23\sz, v0.h[1] // t11
sqrdmulh v3\sz, v29\sz, v0.h[1] // t12
sqrdmulh v4\sz, v6\sz, v0.h[1] // t10a
sqrdmulh v5\sz, v7\sz, v0.h[1] // t13a
smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11
smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12
smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
rshrn_sz v4, v4, v5, #12, \sz // t11
rshrn_sz v5, v6, v7, #12, \sz // t12
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
rshrn_sz v2, v2, v3, #12, \sz // t10a
rshrn_sz v3, v6, v7, #12, \sz // t13a
sqadd v6\sz, v16\sz, v31\sz // out0
sqsub v31\sz, v16\sz, v31\sz // out15
@ -1143,18 +1114,18 @@ def_fns_48 8, 4
sqsub v7\sz, v30\sz, v17\sz // out8
sqadd v17\sz, v18\sz, v27\sz // out1
sqsub v30\sz, v18\sz, v27\sz // out14
sqadd v18\sz, v20\sz, v5\sz // out2
sqsub v29\sz, v20\sz, v5\sz // out13
sqadd v5\sz, v28\sz, v19\sz // out6
sqadd v18\sz, v20\sz, v3\sz // out2
sqsub v29\sz, v20\sz, v3\sz // out13
sqadd v3\sz, v28\sz, v19\sz // out6
sqsub v25\sz, v28\sz, v19\sz // out9
sqadd v19\sz, v22\sz, v3\sz // out3
sqsub v28\sz, v22\sz, v3\sz // out12
sqadd v20\sz, v24\sz, v2\sz // out4
sqsub v27\sz, v24\sz, v2\sz // out11
sqadd v21\sz, v26\sz, v4\sz // out5
sqsub v26\sz, v26\sz, v4\sz // out10
sqadd v19\sz, v22\sz, v5\sz // out3
sqsub v28\sz, v22\sz, v5\sz // out12
sqadd v20\sz, v24\sz, v4\sz // out4
sqsub v27\sz, v24\sz, v4\sz // out11
sqadd v21\sz, v26\sz, v2\sz // out5
sqsub v26\sz, v26\sz, v2\sz // out10
mov v24\szb, v7\szb
mov v22\szb, v5\szb
mov v22\szb, v3\szb
.endm
function inv_dct_8x16_neon
@ -1310,37 +1281,25 @@ endfunc
sqsub v23\sz, v25\sz, v23\sz // t7
sqneg \o3\sz, \o3\sz // out3
movi v0.4s, #2896>>4
smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
ssubl_sz v24, v25, v2, v21, \sz // -> out8 (v24 or v23)
saddl_sz v4, v5, v2, v21, \sz // -> out7 (v23 or v24)
saddl_sz v6, v7, v26, v3, \sz // -> out5 (v21 or v26)
ssubl_sz v2, v3, v26, v3, \sz // -> out10 (v26 or v21)
rshrn_sz v24, v24, v25, #12, \sz // out8
rshrn_sz v4, v4, v5, #12, \sz // out7
rshrn_sz v5, v6, v7, #12, \sz // out5
smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
rshrn_sz v26, v6, v7, #12, \sz // out10
mul_4s_sz v24, v25, v24, v25, v0.s[0], \sz
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
rshrn_sz v24, v24, v25, #8, \sz // out8
rshrn_sz v4, v4, v5, #8, \sz // out7
rshrn_sz v5, v6, v7, #8, \sz // out5
rshrn_sz v26, v2, v3, #8, \sz // out10
saddl_sz v2, v3, v22, v23, \sz // -> out4 (v20 or v27)
ssubl_sz v6, v7, v22, v23, \sz // -> out11 (v27 or v20)
saddl_sz v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
ssubl_sz v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v22, v23, v22, v23, v0.s[0], \sz
mul_4s_sz v21, v25, v21, v25, v0.s[0], \sz
rshrn_sz \o4, v2, v3, #8, \sz // out4
rshrn_sz v6, v6, v7, #8, \sz // out11
rshrn_sz v7, v21, v25, #8, \sz // out9
rshrn_sz \o6, v22, v23, #8, \sz // out6
rshrn_sz \o4, v2, v3, #12, \sz // out4
rshrn_sz v6, v6, v7, #12, \sz // out11
rshrn_sz v7, v21, v25, #12, \sz // out9
rshrn_sz \o6, v22, v23, #12, \sz // out6
.ifc \o8, v23
mov \o8\szb, v24\szb
@ -1915,22 +1874,26 @@ function inv_dct32_odd_8x16_neon
sqsub v24.8h, v24.8h, v19.8h // t27a
mov v19.16b, v4.16b // out19
sub v20.8h, v24.8h, v26.8h // -> t20
add v4.8h, v24.8h, v26.8h // -> t27
sub v5.8h, v25.8h, v27.8h // -> t21a
add v26.8h, v25.8h, v27.8h // -> t26a
sqrdmulh v20.8h, v20.8h, v0.h[1] // t20 = out20
sqrdmulh v27.8h, v4.8h, v0.h[1] // t27 = out27
sub v22.8h, v21.8h, v23.8h // -> t22
add v25.8h, v21.8h, v23.8h // -> t25
sqrdmulh v21.8h, v5.8h, v0.h[1] // t21a = out21
sqrdmulh v26.8h, v26.8h, v0.h[1] // t26a = out26
sub v23.8h, v3.8h, v2.8h // -> t23a
add v24.8h, v3.8h, v2.8h // -> t24a
sqrdmulh v22.8h, v22.8h, v0.h[1] // t22 = out22
sqrdmulh v25.8h, v25.8h, v0.h[1] // t25 = out25
sqrdmulh v23.8h, v23.8h, v0.h[1] // t23a = out23
sqrdmulh v24.8h, v24.8h, v0.h[1] // t24a = out24
smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20
smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27
rshrn_sz v20, v4, v5, #12, .8h // t20
rshrn_sz v22, v6, v7, #12, .8h // t27
smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
mov v27.16b, v22.16b // t27
rshrn_sz v26, v4, v5, #12, .8h // t26a
smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25
rshrn_sz v21, v6, v7, #12, .8h // t21a
rshrn_sz v22, v24, v25, #12, .8h // t22
rshrn_sz v25, v4, v5, #12, .8h // t25
smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a
smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a
rshrn_sz v23, v4, v5, #12, .8h // t23a
rshrn_sz v24, v6, v7, #12, .8h // t24a
ret
endfunc


@ -2975,7 +2975,9 @@ function warp_filter_horz_neon
ld1 {v16.8b, v17.8b}, [x2], x3
load_filter_row d0, w12, w7
uxtl v16.8h, v16.8b
load_filter_row d1, w12, w7
uxtl v17.8h, v17.8b
load_filter_row d2, w12, w7
sxtl v0.8h, v0.8b
load_filter_row d3, w12, w7
@ -2988,16 +2990,12 @@ function warp_filter_horz_neon
sxtl v4.8h, v4.8b
load_filter_row d7, w12, w7
sxtl v5.8h, v5.8b
sxtl v6.8h, v6.8b
sxtl v7.8h, v7.8b
uxtl v16.8h, v16.8b
uxtl v17.8h, v17.8b
ext v18.16b, v16.16b, v17.16b, #2*1
mul v23.8h, v16.8h, v0.8h
sxtl v6.8h, v6.8b
ext v19.16b, v16.16b, v17.16b, #2*2
mul v18.8h, v18.8h, v1.8h
sxtl v7.8h, v7.8b
ext v20.16b, v16.16b, v17.16b, #2*3
mul v19.8h, v19.8h, v2.8h
ext v21.16b, v16.16b, v17.16b, #2*4
@ -3009,28 +3007,20 @@ function warp_filter_horz_neon
saddlp v19.4s, v19.8h
mul v22.8h, v22.8h, v5.8h
saddlp v20.4s, v20.8h
addv s23, v23.4s
saddlp v21.4s, v21.8h
addv s18, v18.4s
saddlp v22.4s, v22.8h
addv s19, v19.4s
trn1 v18.2s, v23.2s, v18.2s
addv s20, v20.4s
addp v18.4s, v23.4s, v18.4s
ext v23.16b, v16.16b, v17.16b, #2*6
trn1 v19.2s, v19.2s, v20.2s
addv s21, v21.4s
addp v19.4s, v19.4s, v20.4s
mul v23.8h, v23.8h, v6.8h
ext v20.16b, v16.16b, v17.16b, #2*7
addv s22, v22.4s
mul v20.8h, v20.8h, v7.8h
saddlp v23.4s, v23.8h
trn1 v21.2s, v21.2s, v22.2s
addp v21.4s, v21.4s, v22.4s
saddlp v20.4s, v20.8h
addv s23, v23.4s
addv s20, v20.4s
trn1 v20.2s, v23.2s, v20.2s
trn1 v18.2d, v18.2d, v19.2d
trn1 v20.2d, v21.2d, v20.2d
addp v20.4s, v23.4s, v20.4s
addp v18.4s, v18.4s, v19.4s
addp v20.4s, v21.4s, v20.4s
add w5, w5, w8
@ -3047,14 +3037,10 @@ endfunc
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
ldr x4, [x4]
ubfx x7, x4, #0, #16
ubfx x8, x4, #16, #16
ubfx x9, x4, #32, #16
ubfx x4, x4, #48, #16
sxth w7, w7
sxth w8, w8
sxth w9, w9
sxth w4, w4
sbfx x7, x4, #0, #16
sbfx x8, x4, #16, #16
sbfx x9, x4, #32, #16
sbfx x4, x4, #48, #16
mov w10, #8
sub x2, x2, x3, lsl #1
sub x2, x2, x3


@ -27,7 +27,7 @@
#include "src/cpu.h"
#include "src/cdef.h"
#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);
void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
@ -58,8 +58,8 @@ cdef_filter_##w##x##h##_neon(pixel *dst, \
const int damping, \
const enum CdefEdgeFlags edges) \
{ \
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,); \
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \
dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \
sec_strength, dir, damping, h); \
@ -76,7 +76,7 @@ COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
c->dir = dav1d_cdef_find_dir_neon;
c->fb[0] = cdef_filter_8x8_neon;
c->fb[1] = cdef_filter_4x8_neon;


@ -0,0 +1,80 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/ipred.h"
decl_angular_ipred_fn(dav1d_ipred_dc_neon);
decl_angular_ipred_fn(dav1d_ipred_dc_128_neon);
decl_angular_ipred_fn(dav1d_ipred_dc_top_neon);
decl_angular_ipred_fn(dav1d_ipred_dc_left_neon);
decl_angular_ipred_fn(dav1d_ipred_h_neon);
decl_angular_ipred_fn(dav1d_ipred_v_neon);
decl_angular_ipred_fn(dav1d_ipred_paeth_neon);
decl_angular_ipred_fn(dav1d_ipred_smooth_neon);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon);
decl_angular_ipred_fn(dav1d_ipred_filter_neon);
decl_cfl_pred_fn(dav1d_ipred_cfl_neon);
decl_cfl_pred_fn(dav1d_ipred_cfl_128_neon);
decl_cfl_pred_fn(dav1d_ipred_cfl_top_neon);
decl_cfl_pred_fn(dav1d_ipred_cfl_left_neon);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_neon);
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_neon);
decl_pal_pred_fn(dav1d_pal_pred_neon);
COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_neon;
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon;
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_neon;
c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_neon;
c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_neon;
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_neon;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_neon;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_neon;
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_neon;
c->pal_pred = dav1d_pal_pred_neon;
#endif
}

View File

@ -107,9 +107,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#endif
#endif
}

View File

@ -1176,14 +1176,18 @@ static int decode_b(Dav1dTileContext *const t,
f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
}
dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
f->frame_hdr, (const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][0][0],
t->bx, t->by, f->w4, f->h4, bs,
b->tx, b->uvtx, f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
(const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][0][0],
t->bx, t->by, f->w4, f->h4, bs,
b->tx, b->uvtx, f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
}
// update contexts
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
@ -1859,17 +1863,21 @@ static int decode_b(Dav1dTileContext *const t,
if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
}
const int is_globalmv =
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
f->frame_hdr, lf_lvls, t->bx, t->by,
f->w4, f->h4, b->skip, bs, b->tx_split,
b->uvtx, f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
if (f->frame_hdr->loopfilter.level_y[0] ||
f->frame_hdr->loopfilter.level_y[1])
{
const int is_globalmv =
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
lf_lvls, t->bx, t->by, f->w4, f->h4,
b->skip, bs, b->tx_split, b->uvtx,
f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
}
// context updates
if (is_comp) {
@ -2339,7 +2347,7 @@ static void setup_tile(Dav1dTileState *const ts,
((ts->tiling.col_start & 16) >> 4);
}
for (int p = 0; p < 3; p++) {
if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
if (!((f->lf.restore_planes >> p) & 1U))
continue;
if (f->frame_hdr->super_res.enabled) {
@ -2503,7 +2511,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
}
// Restoration filter
for (int p = 0; p < 3; p++) {
if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
if (!((f->lf.restore_planes >> p) & 1U))
continue;
const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@ -2817,6 +2825,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
f->lf.lr_mask_sz = lr_mask_sz;
}
f->lf.restore_planes =
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
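
The decode.c hunks above replace per-plane checks of frame_hdr->restoration.type[] with a per-frame bitmask, f->lf.restore_planes. A minimal standalone sketch of the same pattern (the 1 << 0/1/2 values for Y/U/V are implied by the shifts in the code; names here are illustrative):

/* Built once per frame from the three per-plane restoration types... */
static int build_restore_planes(const int type[3]) {
    int planes = 0;
    for (int p = 0; p < 3; p++)
        planes |= (type[p] != 0 /* RESTORATION_NONE */) << p;
    return planes;
}

/* ...so every per-plane loop reduces to a single bit test. */
static int plane_needs_restore(int restore_planes, int p) {
    return (restore_planes >> p) & 1;
}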

View File

@ -42,9 +42,12 @@ static void generate_scaling(const int bitdepth,
const uint8_t points[][2], const int num,
uint8_t scaling[SCALING_SIZE])
{
#if BITDEPTH == 8
const int shift_x = 0;
#else
const int shift_x = bitdepth - 8;
#endif
const int scaling_size = 1 << bitdepth;
const int pad = 1 << shift_x;
// Fill up the preceding entries with the initial value
for (int i = 0; i < points[0][0] << shift_x; i++)
@ -69,9 +72,8 @@ static void generate_scaling(const int bitdepth,
for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
scaling[i] = points[num - 1][1];
if (pad <= 1) return;
const int rnd = pad >> 1;
#if BITDEPTH != 8
const int pad = 1 << shift_x, rnd = pad >> 1;
for (int i = 0; i < num - 1; i++) {
const int bx = points[i][0] << shift_x;
const int ex = points[i+1][0] << shift_x;
@ -83,6 +85,7 @@ static void generate_scaling(const int bitdepth,
}
}
}
#endif
}
#ifndef UNIT_TEST

View File

@ -51,7 +51,7 @@ typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
#define decl_generate_grain_uv_fn(name) \
void (name)(entry buf[][GRAIN_WIDTH], \
const entry buf_y[][GRAIN_WIDTH], \
const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)
const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
#define decl_fgy_32x32xn_fn(name) \

View File

@ -88,7 +88,7 @@ static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
static NOINLINE void
generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
const entry buf_y[][GRAIN_WIDTH],
const Dav1dFilmGrainData *const data, const int uv,
const Dav1dFilmGrainData *const data, const intptr_t uv,
const int subx, const int suby HIGHBD_DECL_SUFFIX)
{
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
@ -156,8 +156,8 @@ gnuv_ss_fn(444, 0, 0);
// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
int offsets[2][2], int subx, int suby,
int bx, int by, int x, int y)
const int offsets[2][2], const int subx, const int suby,
const int bx, const int by, const int x, const int y)
{
const int randval = offsets[bx][by];
const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));

View File

@ -228,6 +228,7 @@ struct Dav1dFrameContext {
int tile_row; // for carry-over at tile row edges
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
int restore_planes; // enum LrRestorePlanes
} lf;
// threading (refer to tc[] for per-thread things)

View File

@ -89,6 +89,7 @@ typedef struct Dav1dIntraPredDSPContext {
} Dav1dIntraPredDSPContext;
bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
#endif /* DAV1D_SRC_IPRED_H */

View File

@ -324,44 +324,37 @@ static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
}
}
static int get_filter_strength(const unsigned blk_wh, const unsigned d,
const int type)
{
int strength = 0;
if (type == 0) {
if (blk_wh <= 8) {
if (d >= 56) strength = 1;
} else if (blk_wh <= 12) {
if (d >= 40) strength = 1;
} else if (blk_wh <= 16) {
if (d >= 40) strength = 1;
} else if (blk_wh <= 24) {
if (d >= 8) strength = 1;
if (d >= 16) strength = 2;
if (d >= 32) strength = 3;
} else if (blk_wh <= 32) {
if (d >= 1) strength = 1;
if (d >= 4) strength = 2;
if (d >= 32) strength = 3;
static int get_filter_strength(const int wh, const int angle, const int is_sm) {
if (is_sm) {
if (wh <= 8) {
if (angle >= 64) return 2;
if (angle >= 40) return 1;
} else if (wh <= 16) {
if (angle >= 48) return 2;
if (angle >= 20) return 1;
} else if (wh <= 24) {
if (angle >= 4) return 3;
} else {
if (d >= 1) strength = 3;
return 3;
}
} else {
if (blk_wh <= 8) {
if (d >= 40) strength = 1;
if (d >= 64) strength = 2;
} else if (blk_wh <= 16) {
if (d >= 20) strength = 1;
if (d >= 48) strength = 2;
} else if (blk_wh <= 24) {
if (d >= 4) strength = 3;
if (wh <= 8) {
if (angle >= 56) return 1;
} else if (wh <= 16) {
if (angle >= 40) return 1;
} else if (wh <= 24) {
if (angle >= 32) return 3;
if (angle >= 16) return 2;
if (angle >= 8) return 1;
} else if (wh <= 32) {
if (angle >= 32) return 3;
if (angle >= 4) return 2;
return 1;
} else {
if (d >= 1) strength = 3;
return 3;
}
}
return strength;
return 0;
}
static void filter_edge(pixel *const out, const int sz,
@ -451,12 +444,12 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
for (int y = 0, xpos = dx; y < height;
y++, dst += PXSTRIDE(stride), xpos += dx)
{
const int frac = (xpos >> 1) & 0x1F;
const int frac = xpos & 0x3E;
for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
if (base < max_base_x) {
const int v = top[base] * (32 - frac) + top[base + 1] * frac;
dst[x] = iclip_pixel((v + 16) >> 5);
const int v = top[base] * (64 - frac) + top[base + 1] * frac;
dst[x] = (v + 32) >> 6;
} else {
pixel_set(&dst[x], top[max_base_x], width - x);
break;
@ -518,30 +511,29 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
}
*topleft = *topleft_in;
const int min_base_x = -(1 + upsample_above);
const int base_inc_x = 1 + upsample_above;
const pixel *const left = &topleft[-(1 + upsample_left)];
const pixel *const top = &topleft[1 + upsample_above];
for (int y = 0, xpos = -dx; y < height;
for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height;
y++, xpos -= dx, dst += PXSTRIDE(stride))
{
int base_x = xpos >> 6;
const int frac_x = (xpos >> 1) & 0x1F;
const int frac_x = xpos & 0x3E;
for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
x++, base_x += base_inc_x, ypos -= dy)
{
int v;
if (base_x >= min_base_x) {
v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
if (base_x >= 0) {
v = topleft[base_x] * (64 - frac_x) +
topleft[base_x + 1] * frac_x;
} else {
const int base_y = ypos >> 6;
assert(base_y >= -(1 + upsample_left));
const int frac_y = (ypos >> 1) & 0x1F;
v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
const int frac_y = ypos & 0x3E;
v = left[-base_y] * (64 - frac_y) +
left[-(base_y + 1)] * frac_y;
}
dst[x] = iclip_pixel((v + 16) >> 5);
dst[x] = (v + 32) >> 6;
}
}
}
@ -588,13 +580,13 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
}
const int base_inc = 1 + upsample_left;
for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
const int frac = (ypos >> 1) & 0x1F;
const int frac = ypos & 0x3E;
for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
if (base < max_base_y) {
const int v = left[-base] * (32 - frac) +
const int v = left[-base] * (64 - frac) +
left[-(base + 1)] * frac;
dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6;
} else {
do {
dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
@ -605,6 +597,22 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
}
}
#if ARCH_X86
#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 + \
flt_ptr[16] * p2 + flt_ptr[17] * p3 + \
flt_ptr[32] * p4 + flt_ptr[33] * p5 + \
flt_ptr[48] * p6
#define FLT_INCR 2
#else
#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 + \
flt_ptr[16] * p2 + flt_ptr[24] * p3 + \
flt_ptr[32] * p4 + flt_ptr[40] * p5 + \
flt_ptr[48] * p6
#define FLT_INCR 1
#endif
/* Up to 32x32 only */
static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
const pixel *const topleft_in,
@ -633,11 +641,8 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
const int8_t *flt_ptr = filter;
for (int yy = 0; yy < 2; yy++) {
for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
flt_ptr[16] * p2 + flt_ptr[17] * p3 +
flt_ptr[32] * p4 + flt_ptr[33] * p5 +
flt_ptr[48] * p6;
for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
ptr[xx] = iclip_pixel((acc + 8) >> 4);
}
ptr += PXSTRIDE(stride);
@ -751,7 +756,11 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
c->pal_pred = pal_pred_c;
#if HAVE_ASM && ARCH_X86
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_intra_pred_dsp_init_arm)(c);
#elif ARCH_X86
bitfn(dav1d_intra_pred_dsp_init_x86)(c);
#endif
#endif
}
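
The z1/z2/z3 hunks above move the directional interpolation from a 5-bit fraction out of 32 to a 6-bit fraction out of 64. Masking with 0x3E still drops bit 0, so (2v + 32) >> 6 with the doubled weights rounds exactly like the old (v + 16) >> 5, and because the result is a weighted average of two in-range pixels the iclip_pixel() call is no longer needed. A scalar sketch of the new form (helper name is illustrative, not the dav1d API):

/* pos is the 26.6 fixed-point sample position; a and b are the two
 * neighbouring reference pixels. */
static inline int interp64(const int a, const int b, const int pos) {
    const int frac = pos & 0x3E;
    return (a * (64 - frac) + b * frac + 32) >> 6; /* always a valid pixel */
}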

View File

@ -286,7 +286,6 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
uint8_t (*const level_cache)[4],
const ptrdiff_t b4_stride,
const Dav1dFrameHeader *const hdr,
const uint8_t (*filter_level)[8][2],
const int bx, const int by,
const int iw, const int ih,
@ -297,9 +296,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
uint8_t *const ay, uint8_t *const ly,
uint8_t *const auv, uint8_t *const luv)
{
if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1])
return;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = imin(iw - bx, b_dim[0]);
const int bh4 = imin(ih - by, b_dim[1]);
@ -350,7 +346,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
uint8_t (*const level_cache)[4],
const ptrdiff_t b4_stride,
const Dav1dFrameHeader *const hdr,
const uint8_t (*filter_level)[8][2],
const int bx, const int by,
const int iw, const int ih,
@ -361,9 +356,6 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
uint8_t *const ay, uint8_t *const ly,
uint8_t *const auv, uint8_t *const luv)
{
if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1])
return;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bw4 = imin(iw - bx, b_dim[0]);
const int bh4 = imin(ih - by, b_dim[1]);

View File

@ -63,7 +63,6 @@ typedef struct Av1Restoration {
void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
const ptrdiff_t b4_stride,
const Dav1dFrameHeader *hdr,
const uint8_t (*level)[8][2], int bx, int by,
int iw, int ih, enum BlockSize bs,
enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
@ -71,7 +70,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
uint8_t *ly, uint8_t *auv, uint8_t *luv);
void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
const ptrdiff_t b4_stride,
const Dav1dFrameHeader *hdr,
const uint8_t (*level)[8][2], int bx, int by,
int iw, int ih, int skip_inter,
enum BlockSize bs, const uint16_t *tx_mask,

View File

@ -75,5 +75,6 @@ typedef struct Dav1dLoopRestorationDSPContext {
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);
#endif /* DAV1D_SRC_LOOPRESTORATION_H */

View File

@ -580,6 +580,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
#elif ARCH_PPC64LE
bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
#elif ARCH_X86
bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
#endif

View File

@ -112,10 +112,7 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
// TODO Also check block level restore type to reduce copying.
const int restore_planes =
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
const int restore_planes = f->lf.restore_planes;
if (restore_planes & LR_RESTORE_Y) {
const int h = f->cur.p.h;
@ -180,12 +177,8 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
}
while (y + stripe_h <= row_h) {
// TODO Look into getting rid of this if
if (y + stripe_h == row_h) {
edges &= ~LR_HAVE_BOTTOM;
} else {
edges |= LR_HAVE_BOTTOM;
}
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
if (lr->type == DAV1D_RESTORATION_WIENER) {
dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filterh, filterv, edges HIGHBD_CALL_SUFFIX);
@ -239,8 +232,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
const int shift_hor = 7 - ss_hor;
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
int unit_w = unit_size, bit = 0;
const Av1RestorationUnit *lr[2];
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
(row_h < h ? LR_HAVE_BOTTOM : 0);
@ -251,26 +243,27 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
aligned_unit_pos <<= ss_ver;
const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
for (int x = 0; x < w; x += unit_w, edges |= LR_HAVE_LEFT, bit ^= 1) {
if (x + max_unit_size > w) {
unit_w = w - x;
edges &= ~LR_HAVE_RIGHT;
}
// Based on the position of the restoration unit, find the corresponding
// AV1Filter unit.
const int u_idx = unit_idx + ((x >> (shift_hor - 1)) & 1);
const Av1RestorationUnit *const lr =
&f->lf.lr_mask[sb_idx + (x >> shift_hor)].lr[plane][u_idx];
// FIXME Don't backup if the next restoration unit is RESTORE_NONE
if (edges & LR_HAVE_RIGHT) {
backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, row_h - y);
}
if (lr->type != DAV1D_RESTORATION_NONE) {
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
}
p += unit_w;
lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
int x = 0, bit = 0;
for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
const int next_x = x + unit_size;
const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
lr[!bit] =
&f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
if (restore_next)
backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
if (restore)
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
lr[bit], edges);
x = next_x;
restore = restore_next;
}
if (restore) {
edges &= ~LR_HAVE_RIGHT;
const int unit_w = w - x;
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
}
}
@ -279,11 +272,7 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
{
const int offset_y = 8 * !!sby;
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
const int restore_planes =
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
const int restore_planes = f->lf.restore_planes;
if (restore_planes & LR_RESTORE_Y) {
const int h = f->sr_cur.p.p.h;
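
The "edges ^= (-(cond) ^ edges) & LR_HAVE_BOTTOM" line in lr_stripe() above is the standard branchless set-bit-to-boolean idiom. A minimal sketch with simplified types (illustrative only):

/* Force the bits in `mask` to the boolean c without branching:
 * -(unsigned)c is all-zeros or all-ones, so the XOR/AND selects exactly
 * the mask bits that need to flip. */
static inline unsigned set_bits_to(unsigned x, unsigned mask, int c) {
    return x ^ ((-(unsigned)c ^ x) & mask);
}
/* usage: edges = set_bits_to(edges, LR_HAVE_BOTTOM, y + stripe_h != row_h); */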

View File

@ -93,6 +93,7 @@ if is_asm_enabled
)
libdav1d_tmpl_sources += files(
'arm/cdef_init_tmpl.c',
'arm/ipred_init_tmpl.c',
'arm/itx_init_tmpl.c',
'arm/loopfilter_init_tmpl.c',
'arm/looprestoration_init_tmpl.c',
@ -101,6 +102,7 @@ if is_asm_enabled
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
@ -109,6 +111,7 @@ if is_asm_enabled
)
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
'arm/32/cdef.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
)
@ -167,6 +170,7 @@ if is_asm_enabled
)
libdav1d_arch_tmpl_sources += files(
'ppc/cdef_init_tmpl.c',
'ppc/looprestoration_init_tmpl.c',
)
endif
endif

View File

@ -1098,6 +1098,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
for (int i = 0; i < num_uv_pos; i++)
fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
if (!fgd->num_y_points)
fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
}
fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
fgd->grain_scale_shift = dav1d_get_bits(gb, 2);

View File

@ -0,0 +1,350 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Michail Alvanos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "common/intops.h"
#include "src/ppc/types.h"
#include "src/cpu.h"
#include "src/looprestoration.h"
#if BITDEPTH == 8
#define REST_UNIT_STRIDE (400)
static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
v = vec_max(minv, v);
v = vec_min(maxv, v);
return v;
}
#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \
i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \
i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \
ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \
ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \
} while (0)
static void wiener_filter_h_vsx(int32_t *hor_ptr,
uint8_t *tmp_ptr,
const int16_t filterh[7],
const int w, const int h)
{
static const i32x4 zerov = vec_splats(0);
static const i32x4 seven_vec = vec_splats(7);
static const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
static const i32x4 round_bits_vec = vec_splats(3);
static const i32x4 rounding_off_vec = vec_splats(1<<2);
static const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
i16x8 filterhvall = vec_vsx_ld(0, filterh);
i16x8 filterhv0 = vec_splat( filterhvall, 0);
i16x8 filterhv1 = vec_splat( filterhvall, 1);
i16x8 filterhv2 = vec_splat( filterhvall, 2);
i16x8 filterhv3 = vec_splat( filterhvall, 3);
i16x8 filterhv4 = vec_splat( filterhvall, 4);
i16x8 filterhv5 = vec_splat( filterhvall, 5);
i16x8 filterhv6 = vec_splat( filterhvall, 6);
for (int j = 0; j < h + 6; j++) {
for (int i = 0; i < w; i+=16) {
i32x4 sum1 = bitdepth_added_vec;
i32x4 sum2 = bitdepth_added_vec;
i32x4 sum3 = bitdepth_added_vec;
i32x4 sum4 = bitdepth_added_vec;
u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);
u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
u16x8 tmp_u16_low = u8l_to_u16(tmp_v3);
i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
i16x8 ssum1 = (i16x8) zerov;
i16x8 ssum2 = (i16x8) zerov;
APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
sum1 = iclip_vec(sum1, zerov, clip_limit_v);
sum2 = iclip_vec(sum2, zerov, clip_limit_v);
sum3 = iclip_vec(sum3, zerov, clip_limit_v);
sum4 = iclip_vec(sum4, zerov, clip_limit_v);
vec_st(sum1, 0, &hor_ptr[i]);
vec_st(sum2, 16, &hor_ptr[i]);
vec_st(sum3, 32, &hor_ptr[i]);
vec_st(sum4, 48, &hor_ptr[i]);
}
tmp_ptr += REST_UNIT_STRIDE;
hor_ptr += REST_UNIT_STRIDE;
}
}
static inline i16x8 iclip_u8_vec(i16x8 v) {
static const i16x8 zerov = vec_splats((int16_t)0);
static const i16x8 maxv = vec_splats((int16_t)255);
v = vec_max(zerov, v);
v = vec_min(maxv, v);
return v;
}
#define APPLY_FILTER_V(index, f) do { \
i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
sum1 = sum1 + v1 * f; \
sum2 = sum2 + v2 * f; \
sum3 = sum3 + v3 * f; \
sum4 = sum4 + v4 * f; \
} while (0)
#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
i32x4 v_1 = (i32x4) vec_ld( 0, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
i32x4 v_2 = (i32x4) vec_ld(16, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
i32x4 v_3 = (i32x4) vec_ld(32, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
i32x4 v_4 = (i32x4) vec_ld(48, &hor[(j + 3) * REST_UNIT_STRIDE + i]); \
i32x4 sum1 = -round_offset_vec; \
i32x4 sum2 = -round_offset_vec; \
i32x4 sum3 = -round_offset_vec; \
i32x4 sum4 = -round_offset_vec; \
APPLY_FILTER_V(0, filterv0); \
APPLY_FILTER_V(1, filterv1); \
APPLY_FILTER_V(2, filterv2); \
APPLY_FILTER_V(3, filterv3); \
APPLY_FILTER_V(4, filterv4); \
APPLY_FILTER_V(5, filterv5); \
APPLY_FILTER_V(6, filterv6); \
sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \
sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \
sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \
sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \
sum1 = sum1 >> round_bits_vec; \
sum2 = sum2 >> round_bits_vec; \
sum3 = sum3 >> round_bits_vec; \
sum4 = sum4 >> round_bits_vec; \
i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \
i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \
sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \
} while (0)
static inline void wiener_filter_v_vsx(uint8_t *p,
const ptrdiff_t p_stride,
const int32_t *hor,
const int16_t filterv[7],
const int w, const int h)
{
static const i32x4 round_bits_vec = vec_splats(11);
static const i32x4 rounding_off_vec = vec_splats(1 << 10);
static const i32x4 round_offset_vec = vec_splats(1 << 18);
static const i32x4 seven_vec = vec_splats(7);
i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
for (int j = 0; j < h; j++) {
for (int i = 0; i <(w-w%16); i += 16) {
u8x16 sum_pixel;
LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(p_stride) + i]);
}
// remaining loop
if (w & 0xf){
int i=w-w%16;
ALIGN_STK_16(uint8_t, tmp_out, 16,);
u8x16 sum_pixel;
LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
vec_vsx_st(sum_pixel, 0, tmp_out);
for (int k=0; i<w; i++, k++) {
p[j * PXSTRIDE(p_stride) + i] = tmp_out[k];
}
}
}
}
static inline void padding(uint8_t *dst, const uint8_t *p,
const ptrdiff_t p_stride, const uint8_t (*left)[4],
const uint8_t *lpf, const ptrdiff_t lpf_stride,
int unit_w, const int stripe_h,
const enum LrEdgeFlags edges)
{
const int have_left = !!(edges & LR_HAVE_LEFT);
const int have_right = !!(edges & LR_HAVE_RIGHT);
// Copy more pixels if we don't have to pad them
unit_w += 3 * have_left + 3 * have_right;
uint8_t *dst_l = dst + 3 * !have_left;
p -= 3 * have_left;
lpf -= 3 * have_left;
if (edges & LR_HAVE_TOP) {
// Copy previous loop filtered rows
const uint8_t *const above_1 = lpf;
const uint8_t *const above_2 = above_1 + PXSTRIDE(lpf_stride);
pixel_copy(dst_l, above_1, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
} else {
// Pad with first row
pixel_copy(dst_l, p, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
if (have_left) {
pixel_copy(dst_l, &left[0][1], 3);
pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
}
}
uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
if (edges & LR_HAVE_BOTTOM) {
// Copy next loop filtered rows
const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
const uint8_t *const below_2 = below_1 + PXSTRIDE(lpf_stride);
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
} else {
// Pad with last row
const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
if (have_left) {
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
}
}
// Inner UNIT_WxSTRIPE_H
for (int j = 0; j < stripe_h; j++) {
pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
dst_tl += REST_UNIT_STRIDE;
p += PXSTRIDE(p_stride);
}
if (!have_right) {
uint8_t *pad = dst_l + unit_w;
uint8_t *row_last = &dst_l[unit_w - 1];
// Pad 3x(STRIPE_H+6) with last column
for (int j = 0; j < stripe_h + 6; j++) {
pixel_set(pad, *row_last, 3);
pad += REST_UNIT_STRIDE;
row_last += REST_UNIT_STRIDE;
}
}
if (!have_left) {
// Pad 3x(STRIPE_H+6) with first column
for (int j = 0; j < stripe_h + 6; j++) {
pixel_set(dst, *dst_l, 3);
dst += REST_UNIT_STRIDE;
dst_l += REST_UNIT_STRIDE;
}
} else {
dst += 3 * REST_UNIT_STRIDE;
for (int j = 0; j < stripe_h; j++) {
pixel_copy(dst, &left[j][1], 3);
dst += REST_UNIT_STRIDE;
}
}
}
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
const uint8_t (*const left)[4],
const uint8_t *lpf,
const ptrdiff_t lpf_stride,
const int w, const int h,
const int16_t filterh[7],
const int16_t filterv[7],
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
wiener_filter_h_vsx(hor, tmp, filterh, w, h);
wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h);
}
#endif
COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)
(Dav1dLoopRestorationDSPContext *const c)
{
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
#if BITDEPTH == 8
c->wiener = wiener_filter_vsx;
#endif
}

View File

@ -47,6 +47,8 @@
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v))
#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))
#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v))
#endif /* DAV1D_SRC_PPC_TYPES_H */

View File

@ -1971,7 +1971,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
start_of_tile_row);
}
if (f->seq_hdr->restoration) {
if (f->lf.restore_planes) {
// Store loop filtered pixels required by loop restoration
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
}
@ -2010,7 +2010,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
f->resize_start[!!pl] HIGHBD_CALL_SUFFIX);
}
}
if (f->seq_hdr->restoration) {
if (f->lf.restore_planes) {
bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
}

View File

@ -716,52 +716,65 @@ const uint16_t dav1d_dr_intra_derivative[44] = {
3 // 87, 177, 267
};
#if ARCH_X86
#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
[2*idx+0] = f0, [2*idx+1] = f1, \
[2*idx+16] = f2, [2*idx+17] = f3, \
[2*idx+32] = f4, [2*idx+33] = f5, \
[2*idx+48] = f6
#else
#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
[1*idx+0] = f0, [1*idx+8] = f1, \
[1*idx+16] = f2, [1*idx+24] = f3, \
[1*idx+32] = f4, [1*idx+40] = f5, \
[1*idx+48] = f6
#endif
const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
{
-6, 10, -5, 2, -3, 1, -3, 1,
-4, 6, -3, 2, -3, 2, -3, 1,
0, 0, 10, 0, 1, 10, 1, 2,
0, 0, 6, 0, 2, 6, 2, 2,
0, 12, 0, 9, 0, 7, 10, 5,
0, 2, 0, 2, 0, 2, 6, 3,
0, 0, 0, 0, 0, 0, 0, 0,
12, 0, 9, 0, 7, 0, 5, 0
F( 0, -6, 10, 0, 0, 0, 12, 0 ),
F( 1, -5, 2, 10, 0, 0, 9, 0 ),
F( 2, -3, 1, 1, 10, 0, 7, 0 ),
F( 3, -3, 1, 1, 2, 10, 5, 0 ),
F( 4, -4, 6, 0, 0, 0, 2, 12 ),
F( 5, -3, 2, 6, 0, 0, 2, 9 ),
F( 6, -3, 2, 2, 6, 0, 2, 7 ),
F( 7, -3, 1, 2, 2, 6, 3, 5 ),
}, {
-10, 16, -6, 0, -4, 0, -2, 0,
-10, 16, -6, 0, -4, 0, -2, 0,
0, 0, 16, 0, 0, 16, 0, 0,
0, 0, 16, 0, 0, 16, 0, 0,
0, 10, 0, 6, 0, 4, 16, 2,
0, 0, 0, 0, 0, 0, 16, 0,
0, 0, 0, 0, 0, 0, 0, 0,
10, 0, 6, 0, 4, 0, 2, 0
F( 0, -10, 16, 0, 0, 0, 10, 0 ),
F( 1, -6, 0, 16, 0, 0, 6, 0 ),
F( 2, -4, 0, 0, 16, 0, 4, 0 ),
F( 3, -2, 0, 0, 0, 16, 2, 0 ),
F( 4, -10, 16, 0, 0, 0, 0, 10 ),
F( 5, -6, 0, 16, 0, 0, 0, 6 ),
F( 6, -4, 0, 0, 16, 0, 0, 4 ),
F( 7, -2, 0, 0, 0, 16, 0, 2 ),
}, {
-8, 8, -8, 0, -8, 0, -8, 0,
-4, 4, -4, 0, -4, 0, -4, 0,
0, 0, 8, 0, 0, 8, 0, 0,
0, 0, 4, 0, 0, 4, 0, 0,
0, 16, 0, 16, 0, 16, 8, 16,
0, 0, 0, 0, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 0,
16, 0, 16, 0, 16, 0, 16, 0
F( 0, -8, 8, 0, 0, 0, 16, 0 ),
F( 1, -8, 0, 8, 0, 0, 16, 0 ),
F( 2, -8, 0, 0, 8, 0, 16, 0 ),
F( 3, -8, 0, 0, 0, 8, 16, 0 ),
F( 4, -4, 4, 0, 0, 0, 0, 16 ),
F( 5, -4, 0, 4, 0, 0, 0, 16 ),
F( 6, -4, 0, 0, 4, 0, 0, 16 ),
F( 7, -4, 0, 0, 0, 4, 0, 16 ),
}, {
-2, 8, -1, 3, -1, 2, 0, 1,
-1, 4, -1, 3, -1, 2, -1, 2,
0, 0, 8, 0, 3, 8, 2, 3,
0, 0, 4, 0, 3, 4, 2, 3,
0, 10, 0, 6, 0, 4, 8, 2,
0, 3, 0, 4, 0, 4, 4, 3,
0, 0, 0, 0, 0, 0, 0, 0,
10, 0, 6, 0, 4, 0, 3, 0
F( 0, -2, 8, 0, 0, 0, 10, 0 ),
F( 1, -1, 3, 8, 0, 0, 6, 0 ),
F( 2, -1, 2, 3, 8, 0, 4, 0 ),
F( 3, 0, 1, 2, 3, 8, 2, 0 ),
F( 4, -1, 4, 0, 0, 0, 3, 10 ),
F( 5, -1, 3, 4, 0, 0, 4, 6 ),
F( 6, -1, 2, 3, 4, 0, 4, 4 ),
F( 7, -1, 2, 2, 3, 4, 3, 3 ),
}, {
-12, 14, -10, 0, -9, 0, -8, 0,
-10, 12, -9, 1, -8, 0, -7, 0,
0, 0, 14, 0, 0, 14, 0, 0,
0, 0, 12, 0, 0, 12, 0, 1,
0, 14, 0, 12, 0, 11, 14, 10,
0, 0, 0, 0, 0, 1, 12, 1,
0, 0, 0, 0, 0, 0, 0, 0,
14, 0, 12, 0, 11, 0, 9, 0
F( 0, -12, 14, 0, 0, 0, 14, 0 ),
F( 1, -10, 0, 14, 0, 0, 12, 0 ),
F( 2, -9, 0, 0, 14, 0, 11, 0 ),
F( 3, -8, 0, 0, 0, 14, 10, 0 ),
F( 4, -10, 12, 0, 0, 0, 0, 14 ),
F( 5, -9, 1, 12, 0, 0, 0, 12 ),
F( 6, -8, 0, 0, 12, 0, 1, 11 ),
F( 7, -7, 0, 0, 1, 12, 1, 9 ),
}
};
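
The F() macro above only changes where each tap of dav1d_filter_intra_taps is stored; together with the FILTER()/FLT_INCR pair in ipred_tmpl.c it lets the C code and the x86 asm index the same 64-byte table in their preferred layouts. A sketch of the non-x86 indexing, where tap t of output position idx (0..7) lives at byte 8*t + idx (helper name is illustrative):

#include <stdint.h>

static inline int filter_intra_acc(const int8_t taps[64], const int idx,
                                   const int p[7]) {
    int acc = 0;
    for (int t = 0; t < 7; t++)
        acc += taps[8 * t + idx] * p[t];
    return acc; /* caller applies iclip_pixel((acc + 8) >> 4) */
}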

View File

@ -32,6 +32,8 @@ pw_1024: times 16 dw 1024
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pd_m65536: dd ~0xffff
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1
@ -55,6 +57,7 @@ pb_27_17_17_27: db 27, 17, 17, 27
%endmacro
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
struc FGData
.seed: resd 1
@ -409,6 +412,443 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
jg .y_loop_ar3
RET
INIT_XMM avx2
cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
lea r4, [pb_mask]
%define base r4-pb_mask
movq xm1, [base+rnd_next_upperbit_mask]
movq xm4, [base+mul_bits]
movq xm7, [base+hmul_bits]
mov r5d, [fg_dataq+FGData.grain_scale_shift]
vpbroadcastw xm8, [base+round+r5*2]
mova xm5, [base+pb_mask]
vpbroadcastw xm0, [fg_dataq+FGData.seed]
vpbroadcastw xm9, [base+pw_seed_xor+uvq*4]
pxor xm0, xm9
vpbroadcastd xm9, [base+pd_m65536]
lea r6, [gaussian_sequence]
mov r7d, 38
add bufq, 44
.loop_y:
mov r5, -44
.loop_x:
pand xm2, xm0, xm1
psrlw xm3, xm2, 10
por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
pmullw xm2, xm4 ; bits 0x0f00 are set
pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds
psllq xm6, xm2, 30
por xm2, xm6
psllq xm6, xm2, 15
por xm2, xm6 ; aggregate each bit into next seed's high bit
pmulhuw xm3, xm0, xm7
por xm2, xm3 ; 4 next output seeds
pshuflw xm0, xm2, q3333
psrlw xm2, 5
pmovzxwd xm3, xm2
mova xm6, xm9
vpgatherdd xm2, [r6+xm3*2], xm6
pandn xm2, xm9, xm2
packusdw xm2, xm2
pmulhrsw xm2, xm8
packsswb xm2, xm2
movd [bufq+r5], xm2
add r5, 4
jl .loop_x
add bufq, 82
dec r7d
jg .loop_y
; auto-regression code
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
movsxd r5, [base+generate_grain_uv_420_avx2_table+r5*4]
lea r5, [r5+base+generate_grain_uv_420_avx2_table]
jmp r5
.ar0:
INIT_YMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
imul uvd, 25
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
movd xm3, [base+hmul_bits+shiftq*2]
DEFINE_ARGS buf, bufy, h
pmovsxbw xm4, xm4
vpbroadcastd m7, [pb_1]
vpbroadcastw m6, [hmul_bits+4]
vpbroadcastw m4, xm4
vpbroadcastw m3, xm3
sub bufq, 82*38+82-(82*3+41)
add bufyq, 3+82*3
mov hd, 35
.y_loop_ar0:
; first 32 pixels
movu xm8, [bufyq]
movu xm9, [bufyq+82]
movu xm10, [bufyq+16]
movu xm11, [bufyq+82+16]
vinserti128 m8, [bufyq+32], 1
vinserti128 m9, [bufyq+82+32], 1
vinserti128 m10, [bufyq+48], 1
vinserti128 m11, [bufyq+82+48], 1
pmaddubsw m8, m7, m8
pmaddubsw m9, m7, m9
pmaddubsw m10, m7, m10
pmaddubsw m11, m7, m11
paddw m8, m9
paddw m10, m11
pmulhrsw m8, m6
pmulhrsw m10, m6
pmullw m8, m4
pmullw m10, m4
pmulhrsw m8, m3
pmulhrsw m10, m3
packsswb m8, m10
movu m0, [bufq]
punpckhbw m1, m0, m8
punpcklbw m0, m8
pmaddubsw m1, m7, m1
pmaddubsw m0, m7, m0
packsswb m0, m1
movu [bufq], m0
; last 6 pixels
movu xm8, [bufyq+32*2]
movu xm9, [bufyq+32*2+82]
pmaddubsw xm8, xm7, xm8
pmaddubsw xm9, xm7, xm9
paddw xm8, xm9
pmulhrsw xm8, xm6
pmullw xm8, xm4
pmulhrsw xm8, xm3
packsswb xm8, xm8
movq xm0, [bufq+32]
punpcklbw xm8, xm0
pmaddubsw xm8, xm7, xm8
packsswb xm8, xm8
vpblendw xm0, xm8, xm0, 1000b
movq [bufq+32], xm0
add bufq, 82
add bufyq, 82*2
dec hd
jg .y_loop_ar0
RET
.ar1:
INIT_XMM avx2
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
imul uvd, 25
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
pmovsxbw xm4, xm4
pshufd xm5, xm4, q1111
pshufd xm4, xm4, q0000
pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
vpbroadcastd xm7, [pb_1]
vpbroadcastw xm6, [hmul_bits+4]
vpbroadcastd xm3, xm3
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35
mov mind, -128
mov maxd, 127
.y_loop_ar1:
mov xq, -38
movsx val3d, byte [bufq+xq-1]
.x_loop_ar1:
pmovsxbw xm0, [bufq+xq-82-1] ; top/left
movq xm8, [bufyq+xq*2]
movq xm9, [bufyq+xq*2+82]
psrldq xm2, xm0, 2 ; top
psrldq xm1, xm0, 4 ; top/right
pmaddubsw xm8, xm7, xm8
pmaddubsw xm9, xm7, xm9
paddw xm8, xm9
pmulhrsw xm8, xm6
punpcklwd xm0, xm2
punpcklwd xm1, xm8
pmaddwd xm0, xm4
pmaddwd xm1, xm5
paddd xm0, xm1
paddd xm0, xm3
.x_loop_ar1_inner:
movd val0d, xm0
psrldq xm0, 4
imul val3d, cf3d
add val3d, val0d
sarx val3d, val3d, shiftd
movsx val0d, byte [bufq+xq]
add val3d, val0d
cmp val3d, maxd
cmovg val3d, maxd
cmp val3d, mind
cmovl val3d, mind
mov byte [bufq+xq], val3b
; keep val3d in-place as left for next x iteration
inc xq
jz .x_loop_ar1_end
test xq, 3
jnz .x_loop_ar1_inner
jmp .x_loop_ar1
.x_loop_ar1_end:
add bufq, 82
add bufyq, 82*2
dec hd
jg .y_loop_ar1
RET
.ar2:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm15, [base+hmul_bits-10+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
DEFINE_ARGS buf, bufy, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
pshufd xm14, xm9, q2222
pxor xm10, xm10
vpblendw xm14, xm10, 10101010b
pshufd xm11, xm8, q3333
pshufd xm10, xm8, q2222
pshufd xm9, xm8, q1111
pshufd xm8, xm8, q0000
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35
.y_loop_ar2:
mov xq, -38
.x_loop_ar2:
pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5]
psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5]
psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5]
punpcklwd xm2, xm0, xm2
punpcklwd xm3, xm4
pmaddwd xm2, xm8
pmaddwd xm3, xm11
paddd xm2, xm3
psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5]
psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5]
psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5]
punpcklwd xm4, xm5
punpcklwd xm6, xm1
psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5]
psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5]
punpcklwd xm7, xm1
pmaddwd xm4, xm9
pmaddwd xm6, xm10
pmaddwd xm7, xm12
paddd xm4, xm6
paddd xm2, xm7
paddd xm2, xm4
vpbroadcastd xm4, [base+pb_1]
movq xm6, [bufyq+xq*2]
movq xm7, [bufyq+xq*2+82]
pmaddubsw xm6, xm4, xm6
pmaddubsw xm7, xm4, xm7
vpbroadcastw xm4, [base+hmul_bits+4]
paddw xm6, xm7
pmulhrsw xm6, xm4
pxor xm7, xm7
punpcklwd xm6, xm7
pmaddwd xm6, xm14
paddd xm2, xm6
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
pmovsxbw xm0, xm0
pmaddwd xm3, xm0, xm13
paddd xm3, xm2
psrldq xm2, 4 ; shift top to next pixel
psrad xm3, 5
packssdw xm3, xm3
pmulhrsw xm3, xm15
pslldq xm3, 2
psrldq xm0, 2
paddw xm3, xm0
vpblendw xm0, xm3, 00000010b
packsswb xm0, xm0
pextrb [bufq+xq], xm0, 1
inc xq
jz .x_loop_ar2_end
test xq, 3
jnz .x_loop_ar2_inner
jmp .x_loop_ar2
.x_loop_ar2_end:
add bufq, 82
add bufyq, 82*2
dec hd
jg .y_loop_ar2
RET
.ar3:
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
SUB rsp, 16*12
%assign stack_size_padded (stack_size_padded+16*12)
%assign stack_size (stack_size+16*12)
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
imul uvd, 25
movd xm14, [base+hmul_bits-10+shiftq*2]
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
pshufd xm9, xm0, q1111
pshufd xm10, xm0, q2222
pshufd xm11, xm0, q3333
pshufd xm0, xm0, q0000
pshufd xm6, xm1, q1111
pshufd xm7, xm1, q2222
pshufd xm8, xm1, q3333
pshufd xm1, xm1, q0000
pshufd xm3, xm2, q1111
pshufd xm4, xm2, q2222
vpbroadcastw xm5, xm5
vpblendw xm4, xm5, 10101010b ; interleave luma cf
psrldq xm5, xm2, 10
pshufd xm2, xm2, q0000
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
mova [rsp+ 0*16], xm0
mova [rsp+ 1*16], xm9
mova [rsp+ 2*16], xm10
mova [rsp+ 3*16], xm11
mova [rsp+ 4*16], xm1
mova [rsp+ 5*16], xm6
mova [rsp+ 6*16], xm7
mova [rsp+ 7*16], xm8
mova [rsp+ 8*16], xm2
mova [rsp+ 9*16], xm3
mova [rsp+10*16], xm4
mova [rsp+11*16], xm5
vpbroadcastd xm13, [base+pb_1]
vpbroadcastw xm15, [base+hmul_bits+4]
DEFINE_ARGS buf, bufy, h, x
sub bufq, 82*38+44-(82*3+41)
add bufyq, 79+82*3
mov hd, 35
.y_loop_ar3:
mov xq, -38
.x_loop_ar3:
movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
pxor xm3, xm3
pcmpgtb xm6, xm3, xm2
pcmpgtb xm5, xm3, xm1
pcmpgtb xm4, xm3, xm0
punpckhbw xm3, xm0, xm4
punpcklbw xm0, xm4
punpckhbw xm4, xm1, xm5
punpcklbw xm1, xm5
punpckhbw xm5, xm2, xm6
punpcklbw xm2, xm6
psrldq xm6, xm0, 2
psrldq xm7, xm0, 4
psrldq xm8, xm0, 6
psrldq xm9, xm0, 8
palignr xm10, xm3, xm0, 10
palignr xm11, xm3, xm0, 12
punpcklwd xm0, xm6
punpcklwd xm7, xm8
punpcklwd xm9, xm10
punpcklwd xm11, xm1
pmaddwd xm0, [rsp+ 0*16]
pmaddwd xm7, [rsp+ 1*16]
pmaddwd xm9, [rsp+ 2*16]
pmaddwd xm11, [rsp+ 3*16]
paddd xm0, xm7
paddd xm9, xm11
paddd xm0, xm9
psrldq xm6, xm1, 2
psrldq xm7, xm1, 4
psrldq xm8, xm1, 6
psrldq xm9, xm1, 8
palignr xm10, xm4, xm1, 10
palignr xm11, xm4, xm1, 12
psrldq xm12, xm2, 2
punpcklwd xm6, xm7
punpcklwd xm8, xm9
punpcklwd xm10, xm11
punpcklwd xm12, xm2, xm12
pmaddwd xm6, [rsp+ 4*16]
pmaddwd xm8, [rsp+ 5*16]
pmaddwd xm10, [rsp+ 6*16]
pmaddwd xm12, [rsp+ 7*16]
paddd xm6, xm8
paddd xm10, xm12
paddd xm6, xm10
paddd xm0, xm6
psrldq xm6, xm2, 4
psrldq xm7, xm2, 6
psrldq xm8, xm2, 8
palignr xm9, xm5, xm2, 10
palignr xm5, xm5, xm2, 12
movq xm1, [bufyq+xq*2]
movq xm2, [bufyq+xq*2+82]
pmaddubsw xm1, xm13, xm1
pmaddubsw xm2, xm13, xm2
paddw xm1, xm2
vpbroadcastw xm3, xm15
pmulhrsw xm1, xm3
punpcklwd xm6, xm7
punpcklwd xm8, xm9
punpcklwd xm5, xm1
pmaddwd xm6, [rsp+ 8*16]
pmaddwd xm8, [rsp+ 9*16]
pmaddwd xm5, [rsp+10*16]
paddd xm0, xm6
paddd xm8, xm5
paddd xm0, xm8
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
.x_loop_ar3_inner:
pmovsxbw xm1, xm1
pmaddwd xm2, xm1, [rsp+16*11]
pshufd xm3, xm2, q1111
paddd xm2, xm3 ; left+cur
paddd xm2, xm0 ; add top
psrldq xm0, 4
psrad xm2, 5
packssdw xm2, xm2
pmulhrsw xm2, xm14
pslldq xm2, 6
vpblendw xm1, xm2, 1000b
packsswb xm1, xm1
pextrb [bufq+xq], xm1, 3
psrldq xm1, 1
inc xq
jz .x_loop_ar3_end
test xq, 3
jnz .x_loop_ar3_inner
jmp .x_loop_ar3
.x_loop_ar3_end:
add bufq, 82
add bufyq, 82*2
dec hd
jg .y_loop_ar3
RET
INIT_YMM avx2
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
pcmpeqw m10, m10

View File

@ -29,6 +29,7 @@
#include "src/film_grain.h"
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
@ -39,6 +40,7 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
#if BITDEPTH == 8 && ARCH_X86_64
c->generate_grain_y = dav1d_generate_grain_y_avx2;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
#endif

File diff suppressed because it is too large

View File

@ -39,6 +39,7 @@ decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
decl_angular_ipred_fn(dav1d_ipred_z2_avx2);
decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
@ -119,6 +120,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2;
c->intra_pred[Z2_PRED] = dav1d_ipred_z2_avx2;
c->intra_pred[Z3_PRED] = dav1d_ipred_z3_avx2;
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2;

View File

@ -50,7 +50,6 @@ pw_2482_3344: dw 2482, 3344
pw_m3344_3344: dw -3344, 3344
pw_m3803_3344: dw -3803, 3344
pw_m3803_m6688: dw -3803, -6688
COEF_PAIR 2896, 2896
pw_2896_m2896: dw 2896, -2896
pw_5: times 2 dw 5
@ -63,6 +62,7 @@ pw_5793x4: times 2 dw 5793*4
pd_2048: dd 2048
COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR 3784, 1567
COEF_PAIR 201, 4091
@ -194,7 +194,7 @@ SECTION .text
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
punpckhwd m%3, m%2, m%1
punpcklwd m%2, m%1
%if %7 < 32
@ -222,20 +222,20 @@ SECTION .text
paddd m%2, m%5
psrad m%3, 12
psrad m%2, 12
%if %0 == 8
packssdw m%8, m%2, m%3
%else
packssdw m%2, m%3
%endif
%endmacro
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
vpbroadcastd m%6, [o(pw_2896x8)]
paddw m%5, m%1, m%3
psubw m%1, m%3
pmulhrsw m%1, m%6 ; t1
pmulhrsw m%5, m%6 ; t0
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
psubsw m%3, m%1, m%2
paddsw m%2, m%1
paddsw m%1, m%5, m%4
psubsw m%4, m%5, m%4
paddsw m%1, m%4, m%5
psubsw m%4, m%5
%endmacro
%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
@ -246,27 +246,20 @@ SECTION .text
psubsw m%2, m%6 ; t5a
paddsw m%10, m%8, m%4 ; t7
psubsw m%8, m%4 ; t6a
vpbroadcastd m%4, [o(pw_2896x8)]
psubw m%6, m%1, m%5
paddw m%1, m%5
psubw m%5, m%8, m%2
paddw m%8, m%2
pmulhrsw m%1, m%4 ; t0
pmulhrsw m%6, m%4 ; t1
pmulhrsw m%8, m%4 ; t6
pmulhrsw m%5, m%4 ; t5
psubsw m%4, m%1, m%7 ; dct4 out3
paddsw m%1, m%7 ; dct4 out0
paddsw m%7, m%6, m%3 ; dct4 out1
psubsw m%6, m%3 ; dct4 out2
paddsw m%2, m%7, m%8 ; out1
psubsw m%7, m%8 ; out6
ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
psubsw m%6, m%1, m%3 ; dct4 out2
paddsw m%3, m%1 ; dct4 out1
paddsw m%1, m%5, m%7 ; dct4 out0
psubsw m%5, m%7 ; dct4 out3
psubsw m%7, m%3, m%2 ; out6
paddsw m%2, m%3 ; out1
paddsw m%3, m%6, m%8 ; out2
psubsw m%6, m%8 ; out5
psubsw m%8, m%1, m%10 ; out7
paddsw m%1, m%10 ; out0
paddsw m%3, m%6, m%5 ; out2
psubsw m%6, m%5 ; out5
psubsw m%5, m%4, m%9 ; out4
paddsw m%4, m%9 ; out3
paddsw m%4, m%5, m%9 ; out3
psubsw m%5, m%9 ; out4
%endmacro
; in1 = %1, in3 = %2, in5 = %3, in7 = %4
@ -286,20 +279,16 @@ SECTION .text
paddsw m%1, m%5 ; t8
ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
vpbroadcastd m%10, [o(pw_2896x8)]
psubsw m%5, m%2, m%9 ; t10
paddsw m%2, m%9 ; t9
psubsw m%9, m%1, m%3 ; t11a
psubsw m%5, m%1, m%3 ; t11a
paddsw m%1, m%3 ; t8a
psubsw m%3, m%7, m%4 ; t13
paddsw m%7, m%4 ; t14
psubsw m%4, m%8, m%6 ; t12a
paddsw m%8, m%6 ; t15a
paddw m%6, m%3, m%5 ; t13a
psubw m%3, m%5 ; t10a
paddw m%5, m%4, m%9 ; t12
psubw m%4, m%9 ; t11
REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
psubsw m%6, m%2, m%9 ; t10
paddsw m%2, m%9 ; t9
ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12
%endmacro
%macro WRAP_XMM 1+
@ -446,21 +435,14 @@ ALIGN function_align
%endif
%endmacro
%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
%macro IDCT4_1D_PACKED 0
vpbroadcastd m4, [o(pd_2048)]
punpckhwd m2, m1, m0
psubw m3, m0, m1
paddw m0, m1
punpcklqdq m0, m3
ITX_MUL2X_PACK 2, 1, 3, 4, 1567, 3784
%if %0 == 1
pmulhrsw m0, m%1
%else
vpbroadcastd m4, [o(pw_2896x8)]
pmulhrsw m0, m4 ; t0 t1
%endif
psubsw m1, m0, m2 ; out3 out2
paddsw m0, m2 ; out0 out1
punpcklwd m1, m0
ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
paddsw m0, m1, m2 ; out0 out1
psubsw m1, m2 ; out3 out2
%endmacro
%macro IADST4_1D_PACKED 0
@ -683,30 +665,30 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpbroadcastd m6, [o(pd_2048)]
punpckhwd m5, m3, m0 ; in7 in1
punpckhwd m4, m1, m2 ; in3 in5
punpcklwd m3, m1 ; in2 in6
psubw m1, m0, m2
paddw m0, m2
punpcklqdq m0, m1 ; in0+in4 in0-in4
ITX_MUL2X_PACK 5, 1, 2, 6, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2
vpbroadcastd m6, [o(pw_2896x8)]
psubsw m2, m5, m4 ; t4 t7
paddsw m5, m4 ; t5a t6a
pshufd m4, m2, q1032
psubw m1, m2, m4
paddw m4, m2
vpblendd m4, m4, m1, 0xcc
pmulhrsw m0, m6 ; t0 t1
pmulhrsw m4, m6 ; t6 t5
psubsw m1, m0, m3 ; tmp3 tmp2
paddsw m0, m3 ; tmp0 tmp1
shufps m2, m5, m4, q1032 ; t7 t6
vpblendd m5, m5, m4, 0xcc ; t4 t5
psubsw m3, m0, m2 ; out7 out6
paddsw m0, m2 ; out0 out1
psubsw m2, m1, m5 ; out4 out5
paddsw m1, m5 ; out3 out2
punpcklwd m3, m1 ; in6 in2
punpcklwd m2, m0 ; in4 in0
ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
psubsw m0, m5, m4 ; t5a t6a (interleaved)
paddsw m4, m5 ; t4 t7 (interleaved)
ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
vpbroadcastd m1, [o(pw_m2896_2896)]
ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5
%if mmsize > 16
vbroadcasti128 m1, [o(deint_shuf)]
pshufb m4, m1
%else
pshufb m4, [o(deint_shuf)]
%endif
psubsw m1, m2, m3 ; tmp3 tmp2
paddsw m3, m2 ; tmp0 tmp1
shufps m2, m4, m0, q1032 ; t7 t6
vpblendd m4, m0, 0xcc ; t4 t5
paddsw m0, m3, m2 ; out0 out1
psubsw m3, m2 ; out7 out6
psubsw m2, m1, m4 ; out4 out5
paddsw m1, m4 ; out3 out2
%endmacro
%macro IADST8_1D_PACKED 1 ; pass
@ -797,10 +779,10 @@ INV_TXFM_4X8_FN dct, flipadst
cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m5, [o(pw_2896x8)]
pmulhrsw m0, m5
pmulhrsw m1, m5
IDCT4_1D_PACKED 5
vpbroadcastd m2, [o(pw_2896x8)]
pmulhrsw m0, m2
pmulhrsw m1, m2
IDCT4_1D_PACKED
vbroadcasti128 m2, [o(deint_shuf)]
shufps m3, m0, m1, q1331
shufps m0, m0, m1, q0220
@ -1011,9 +993,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m10, [o(pd_2048)]
.main2:
punpckhwd m8, m7, m0 ; dct16 in15 in1
paddw m9, m0, m4
psubw m0, m4
punpcklqdq m9, m0 ; dct4 in0+in2 in0-in2
punpcklwd m9, m4, m0 ; dct4 in2 in0
punpckhwd m0, m3, m4 ; dct16 in7 in9
punpcklwd m7, m1 ; dct8 in7 in1
punpckhwd m1, m6 ; dct16 in3 in13
@ -1024,47 +1004,44 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a
ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a
ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
psubsw m2, m8, m0 ; t9 t14
paddsw m8, m0 ; t8 t15
psubsw m0, m1, m5 ; t10 t13
paddsw m1, m5 ; t11 t12
%if mmsize > 16
vbroadcasti128 m5, [o(deint_shuf)]
%else
mova m5, [o(deint_shuf)]
%endif
pshufb m8, m5
pshufb m1, m5
vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a
vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a
psubsw m5, m7, m3 ; t5a t6a
paddsw m7, m3 ; t4 t7
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a
psubsw m4, m8, m1 ; t11a t12a
paddsw m8, m1 ; t8a t15a
paddsw m1, m2, m0 ; t9 t14
psubsw m1, m7, m3 ; t5a t6a
paddsw m7, m3 ; t4 t7
paddsw m3, m2, m0 ; t9 t14
psubsw m2, m0 ; t10 t13
punpckhqdq m0, m8, m1 ; t15a t14
punpcklqdq m8, m1 ; t8a t9
pshufd m3, m5, q1032
psubw m1, m5, m3
paddw m3, m5
vpblendd m3, m3, m1, 0xcc ; t6 t5
vpbroadcastd m1, [o(pw_2896x8)]
punpckhqdq m5, m4, m2 ; t12a t13
punpcklqdq m2, m4, m2 ; t11a t10
psubw m4, m5, m2
paddw m5, m2
pmulhrsw m9, m1 ; t0 t1
pmulhrsw m3, m1 ; t6 t5
pmulhrsw m4, m1 ; t11 t10a
pmulhrsw m5, m1 ; t12 t13a
shufps m2, m7, m3, q1032 ; t7 t6
vpblendd m7, m7, m3, 0xcc ; t4 t5
%if mmsize > 16
vbroadcasti128 m0, [o(deint_shuf)]
%else
mova m0, [o(deint_shuf)]
%endif
pshufb m8, m0
pshufb m7, m0
pshufb m3, m0
ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1
vpbroadcastd m0, [o(pw_m2896_2896)]
ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12
vpbroadcastd m5, [o(pw_2896_2896)]
ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5
vpbroadcastd m0, [o(pw_m2896_2896)]
ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a
punpckhqdq m0, m8, m3 ; t15a t14
punpcklqdq m8, m3 ; t8a t9
shufps m5, m4, m2, q1032 ; t12 t13a
vpblendd m4, m2, 0xcc ; t11 t10a
shufps m2, m7, m1, q1032 ; t7 t6
vpblendd m7, m1, 0xcc ; t4 t5
psubsw m1, m9, m6 ; dct4 out3 out2
paddsw m9, m6 ; dct4 out0 out1
psubsw m3, m9, m2 ; dct8 out7 out6
@ -3699,12 +3676,11 @@ ALIGN function_align
paddsw m6, m11 ; t17 t30
psubsw m11, m0, m14 ; t21 t26
paddsw m0, m14 ; t22 t25
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 1 ; t18a t29a
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 1 ; t19 t28
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 1 ; t20 t27
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
vbroadcasti128 m12, [o(deint_shuf)]
REPX {pshufb x, m12}, m0, m1, m6, m8
psubsw m14, m1, m8 ; t23 t24
paddsw m1, m8 ; t16 t31
psubsw m8, m6, m0 ; t22a t25a
@ -3713,16 +3689,18 @@ ALIGN function_align
paddsw m15, m11 ; t18 t29
psubsw m11, m13, m9 ; t20a t27a
paddsw m13, m9 ; t19a t28a
vpbroadcastd m12, [o(pw_2896x8)]
punpcklqdq m9, m11, m0 ; t20a t21
punpckhqdq m11, m0 ; t27a t26
punpcklqdq m0, m14, m8 ; t23 t22a
punpckhqdq m14, m8 ; t24 t25a
psubw m8, m11, m9 ; t20 t21a
paddw m11, m9 ; t27 t26a
psubw m9, m14, m0 ; t23a t22
paddw m14, m0 ; t24a t25
REPX {pmulhrsw x, m12}, m8, m9, m14, m11
REPX {pshufb x, m12}, m1, m6, m15, m13
ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a
vpbroadcastd m9, [o(pw_m2896_2896)]
ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25
vpbroadcastd m12, [o(pw_2896_2896)]
ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a
vpbroadcastd m12, [o(pw_2896_2896)]
ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20
shufps m9, m14, m8, q1032 ; t23a t22
vpblendd m14, m8, 0xcc ; t24a t25
shufps m8, m11, m0, q1032 ; t20 t21a
vpblendd m11, m0, 0xcc ; t27 t26a
punpcklqdq m0, m1, m6 ; t16 t17a
punpckhqdq m1, m6 ; t31 t30a
psubsw m10, m5, m8 ; out20 out21
@ -4327,33 +4305,29 @@ ALIGN function_align
mova m5, [rsp+gprsize+32*0] ; t22
mova m6, [rsp+gprsize+32*1] ; t23
mova m3, [rsp+gprsize+32*2] ; t24a
vpbroadcastd m8, [o(pw_2896x8)]
psubsw m1, m14, m5 ; t22a
paddsw m14, m5 ; t17a
psubsw m5, m0, m6 ; t23
paddsw m0, m6 ; t16
psubsw m6, m4, m3 ; t24
paddsw m4, m3 ; t31
vpbroadcastd m8, [o(pw_m2896_2896)]
vpbroadcastd m3, [o(pw_2896_2896)]
mova [tmp1q-32*4], m0
mova [tmp1q-32*3], m14
mova [tmp2q+32*3], m4
psubw m3, m13, m9 ; t20
paddw m13, m9 ; t27
psubw m9, m2, m10 ; t21a
paddw m2, m10 ; t26a
psubw m10, m7, m1 ; t22
paddw m7, m1 ; t25
psubw m1, m6, m5 ; t23a
paddw m6, m5 ; t24a
REPX {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
mova [tmp1q+32*0], m3
mova [tmp1q+32*1], m9
mova [tmp1q+32*2], m10
mova [tmp1q+32*3], m1
mova [tmp2q-32*4], m6
mova [tmp2q-32*3], m7
mova [tmp2q-32*2], m2
mova [tmp2q-32*1], m13
ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27
ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a
ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25
ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a
mova [tmp1q+32*0], m13
mova [tmp1q+32*1], m2
mova [tmp1q+32*2], m7
mova [tmp1q+32*3], m6
mova [tmp2q-32*4], m5
mova [tmp2q-32*3], m1
mova [tmp2q-32*2], m10
mova [tmp2q-32*1], m9
ret
ALIGN function_align
.transpose_2x8x8_round:
@ -5237,11 +5211,10 @@ ALIGN function_align
sub rax, o_idct64_offset + 8
vpbroadcastd m11, [o(pw_1567_3784)]
vpbroadcastd m12, [o(pw_m3784_1567)]
vpbroadcastd m13, [o(pw_m1567_m3784)]
vpbroadcastd m14, [o(pw_2896x8)]
vpbroadcastd m13, [o(pw_2896_2896)]
vpbroadcastd m14, [o(pw_m2896_2896)]
.main_part2_pass1_loop:
call .main_part2_internal
REPX {pmulhrsw x, m14}, m1, m2, m4, m3
IDCT64_PART2_END 0, 7, 0, 6, 9, 10
IDCT64_PART2_END 7, 8, 5, 0, 6, 7
IDCT64_PART2_END 8, 2, 1, 0, 6, 7
@ -5251,53 +5224,51 @@ ALIGN function_align
ret
.main_part2_internal:
mova m0, [tmp1q-32*12] ; t32a
mova m1, [tmp2q-32*13] ; t39a
mova m2, [tmp1q-32* 4] ; t40a
mova m6, [tmp2q-32*13] ; t39a
mova m1, [tmp1q-32* 4] ; t40a
mova m5, [tmp2q+32* 3] ; t55a
add tmp1q, 32
sub tmp2q, 32
mova m4, [tmp1q+32* 3] ; t48a
mova m3, [tmp2q-32* 4] ; t47a
mova m6, [tmp1q+32*11] ; t56a
mova m2, [tmp1q+32* 3] ; t48a
mova m4, [tmp2q-32* 4] ; t47a
mova m3, [tmp1q+32*11] ; t56a
mova m7, [tmp2q+32*12] ; t63a
psubsw m8, m0, m1 ; t39
paddsw m0, m1 ; t32
psubsw m1, m3, m2 ; t40
paddsw m3, m2 ; t47
psubsw m2, m4, m5 ; t55
paddsw m4, m5 ; t48
psubsw m5, m7, m6 ; t56
paddsw m7, m6 ; t63
ITX_MULSUB_2W 5, 8, 6, 9, 15, 11, 12 ; t39a, t56a
ITX_MULSUB_2W 2, 1, 6, 9, 15, 12, 13 ; t40a, t55a
psubsw m6, m0, m3 ; t47a
paddsw m0, m3 ; t32a
psubsw m3, m7, m4 ; t48a
paddsw m7, m4 ; t63a
psubsw m4, m5, m2 ; t40
paddsw m5, m2 ; t39
psubsw m2, m8, m1 ; t55
paddsw m8, m1 ; t56
psubw m1, m2, m4 ; t40a
paddw m2, m4 ; t55a
psubw m4, m3, m6 ; t47
paddw m3, m6 ; t48
psubsw m8, m0, m6 ; t39
paddsw m0, m6 ; t32
psubsw m6, m4, m1 ; t40
paddsw m4, m1 ; t47
psubsw m1, m2, m5 ; t55
paddsw m2, m5 ; t48
psubsw m5, m7, m3 ; t56
paddsw m7, m3 ; t63
ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a
vpbroadcastd m9, [o(pw_m1567_m3784)]
ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a
psubsw m3, m0, m4 ; t47a
paddsw m0, m4 ; t32a
psubsw m4, m7, m2 ; t48a
paddsw m7, m2 ; t63a
psubsw m2, m5, m1 ; t40
paddsw m5, m1 ; t39
psubsw m1, m8, m6 ; t55
paddsw m8, m6 ; t56
ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48
ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a
ret
.main_part2_pass2:
sub rax, o_idct64_offset + 8
vpbroadcastd m11, [o(pw_1567_3784)]
vpbroadcastd m12, [o(pw_m3784_1567)]
vpbroadcastd m13, [o(pw_m1567_m3784)]
vpbroadcastd m14, [o(pw_2048)]
vpbroadcastd m13, [o(pw_2896_2896)]
lea r9, [strideq*5] ; stride*5
lea r3, [r9+strideq*1] ; stride*6
lea r7, [r9+strideq*2] ; stride*7
lea r8, [r3+strideq*2] ; stride*8
lea r2, [dstq+r7]
.main_part2_pass2_loop:
vpbroadcastd m14, [o(pw_m2896_2896)]
call .main_part2_internal
vpbroadcastd m10, [o(pw_2896x8)]
REPX {pmulhrsw x, m10}, m1, m2, m4, m3
vpbroadcastd m14, [o(pw_2048)]
IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8


@ -202,7 +202,7 @@ SECTION .text
ret
%endmacro
; flags: 1 = swap, 2: coef_regs
; flags: 1 = swap, 2: coef_regs, 4: no_pack
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
pmaddwd m%2, m%4, m%1
@ -218,24 +218,17 @@ SECTION .text
paddd m%1, m%3
psrad m%2, 12
psrad m%1, 12
%if %6 & 4 == 0
packssdw m%1, m%2
%endif
%endmacro
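; Illustrative note: with the new flag 4 (no_pack) the final packssdw above
; is skipped, so the two sets of 32-bit results are left unpacked in m%1 and
; m%2 and the caller packs them itself, as the IDCT16 odd half below does
; with its own packssdw pair after two no_pack calls.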
%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
punpckhwd m2, m0, m1 ;unpacked in1 in3
psubw m3, m0, m1
paddw m0, m1
punpcklqdq m0, m3 ;high: in0-in2 ;low: in0+in2
mova m3, [o(pd_2048)]
punpckhwd m2, m0, m1 ;unpacked in1 in3
punpcklwd m0, m1 ;unpacked in0 in2
ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
%if %0 == 1
pmulhrsw m0, m%1
%else
pmulhrsw m0, [o(pw_2896x8)] ;high: t1 ;low: t0
%endif
ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
psubsw m1, m0, m2 ;high: out2 ;low: out3
paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
@ -499,79 +492,81 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
%macro IDCT8_1D_PACKED 0
mova m6, [o(pd_2048)]
punpckhwd m5, m0, m3 ;unpacked in1 in7
punpckhwd m4, m2, m1 ;unpacked in5 in3
punpckhwd m4, m0, m3 ;unpacked in1 in7
punpcklwd m0, m2 ;unpacked in0 in4
punpckhwd m2, m1 ;unpacked in5 in3
punpcklwd m1, m3 ;unpacked in2 in6
psubw m3, m0, m2
paddw m0, m2
punpcklqdq m0, m3 ;low: in0+in4 high: in0-in4
ITX_MUL2X_PACK 5, 2, 6, 799, 4017, 1 ;low: t4a high: t7a
ITX_MUL2X_PACK 4, 2, 6, 3406, 2276, 1 ;low: t5a high: t6a
ITX_MUL2X_PACK 1, 2, 6, 1567, 3784 ;low: t3 high: t2
mova m6, [o(pw_2896x8)]
psubsw m2, m5, m4 ;low: t5a high: t6a
paddsw m5, m4 ;low: t4 high: t7
punpckhqdq m4, m2, m2 ;low: t6a high: t6a
psubw m3, m4, m2 ;low: t6a - t5a
paddw m4, m2 ;low: t6a + t5a
punpcklqdq m4, m3 ;low: t6a + t5a high: t6a - t5a
pmulhrsw m0, m6 ;low: t0 high: t1
pmulhrsw m4, m6 ;low: t6 high: t5
shufps m2, m5, m4, q1032 ;low: t7 high: t6
shufps m5, m4, q3210 ;low: t4 high: t5
psubsw m4, m0, m1 ;low: tmp3 high: tmp2
ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
psubsw m3, m4, m2 ;low: t6a high: t5a
paddsw m4, m2 ;low: t7 high: t4
pshufb m3, [o(deint_shuf1)]
ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
psubsw m2, m0, m1 ;low: tmp3 high: tmp2
paddsw m0, m1 ;low: tmp0 high: tmp1
psubsw m3, m0, m2 ;low: out7 high: out6
paddsw m0, m2 ;low: out0 high: out1
psubsw m2, m4, m5 ;low: out4 high: out5
paddsw m1, m4, m5 ;low: out3 high: out2
punpcklqdq m1, m4, m3 ;low: t7 high: t6
punpckhqdq m4, m3 ;low: t4 high: t5
psubsw m3, m0, m1 ;low: out7 high: out6
paddsw m0, m1 ;low: out0 high: out1
paddsw m1, m2, m4 ;low: out3 high: out2
psubsw m2, m4 ;low: out4 high: out5
%endmacro
;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
punpckhwd m%3, m%1, m%2
%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
punpckhwd m%4, m%1, m%2
punpcklwd m%1, m%2
%if %7 < 8
pmaddwd m%2, m%7, m%1
pmaddwd m%4, m%7, m%3
pmaddwd m%3, m%7, m%4
%else
mova m%2, [o(pw_%7_%6)]
pmaddwd m%4, m%3, m%2
%if %8
pmaddwd m%3, m%1, m%2
pmaddwd m%2, m%4
%else
pmaddwd m%3, m%4, m%2
pmaddwd m%2, m%1
%endif
paddd m%4, m%5
paddd m%2, m%5
psrad m%4, 12
psrad m%2, 12
packssdw m%2, m%4 ;dst2
%if %7 < 8
pmaddwd m%3, m%6
pmaddwd m%1, m%6
%else
mova m%4, [o(pw_%6_m%7)]
pmaddwd m%3, m%4
pmaddwd m%1, m%4
%endif
paddd m%3, m%5
paddd m%1, m%5
paddd m%2, m%5
psrad m%3, 12
psrad m%2, 12
%if %8
packssdw m%3, m%2
%else
packssdw m%2, m%3 ;dst2
%endif
%if %7 < 8
pmaddwd m%4, m%6
pmaddwd m%1, m%6
%elif %8
mova m%2, [o(pw_%6_m%7)]
pmaddwd m%4, m%2
pmaddwd m%1, m%2
%else
mova m%3, [o(pw_%6_m%7)]
pmaddwd m%4, m%3
pmaddwd m%1, m%3
%endif
paddd m%4, m%5
paddd m%1, m%5
psrad m%4, 12
psrad m%1, 12
packssdw m%1, m%3 ;dst1
packssdw m%1, m%4 ;dst1
%endmacro
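; Illustrative note: when the optional dst2_in_tmp1 argument is nonzero,
; dst2 is packed into m%3 (the first temporary) instead of m%2, so a
; hypothetical call such as
;   ITX_MULSUB_2W 2, 4, 5, 6, 7, 1567, 3784, 1
; leaves dst1 in m2 and dst2 in m5; this is what lets the reworked IDCT4_1D
; below keep both results live without the trailing mova the old version
; needed.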
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ;t2, t3
mova m%6, [o(pw_2896x8)]
paddw m%5, m%1, m%3
psubw m%1, m%3
pmulhrsw m%1, m%6 ;t1
pmulhrsw m%5, m%6 ;t0
psubsw m%3, m%1, m%2 ;out2
paddsw m%2, m%1 ;out1
paddsw m%1, m%5, m%4 ;out0
psubsw m%5, m%4 ;out3
mova m%4, m%5
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
psubsw m%3, m%1, m%2 ;out2
paddsw m%2, m%1 ;out1
paddsw m%1, m%5, m%4 ;out0
psubsw m%4, m%5 ;out3
%endmacro
%macro WRITE_4X8 4 ;row[1-4]
@ -1286,17 +1281,13 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%endmacro
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276 ;t5a, t6a
psubsw m%5, m%1, m%3 ;t5a
paddsw m%1, m%3 ;t4
psubsw m%6, m%4, m%2 ;t6a
paddsw m%4, m%2 ;t7
mova m%3, [o(pw_2896x8)]
psubw m%2, m%6, m%5 ;t6a - t5a
paddw m%6, m%5 ;t6a + t5a
pmulhrsw m%2, m%3 ;t5
pmulhrsw m%3, m%6 ;t6
ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
psubsw m%2, m%4, m%5 ;t6a
paddsw m%4, m%5 ;t7
psubsw m%5, m%1, m%3 ;t5a
paddsw m%1, m%3 ;t4
ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
%endmacro
INV_TXFM_8X8_FN dct, dct, 0
@ -2063,37 +2054,34 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
punpckhwd m%5, m%4, m%1 ;packed in13 in3
punpcklwd m%1, m%4 ;packed in1 in15
punpcklwd m%6, m%3, m%2 ;packed in9 in7
punpcklwd m%4, m%3, m%2 ;packed in9 in7
punpckhwd m%2, m%3 ;packed in5 in11
mova m%7, [o(pd_2048)]
ITX_MUL2X_PACK %1, %4, %7, 401, 4076, 1 ;low: t8a high: t15a
ITX_MUL2X_PACK %6, %4, %7, 3166, 2598, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %2, %4, %7, 1931, 3612, 1 ;low: t10a high: t13a
ITX_MUL2X_PACK %5, %4, %7, 3920, 1189, 1 ;low: t11a high: t12a
psubsw m%4, m%1, m%6 ;low: t9 high: t14
paddsw m%1, m%6 ;low: t8 high: t15
ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
psubsw m%6, m%1, m%4 ;low: t9 high: t14
paddsw m%1, m%4 ;low: t8 high: t15
psubsw m%3, m%5, m%2 ;low: t10 high: t13
paddsw m%2, m%5 ;low: t11 high: t12
punpcklqdq m%5, m%4, m%3 ;low: t9 high: t10
punpckhqdq m%4, m%3 ;low: t14 high: t13
punpcklwd m%6, m%4, m%5 ;packed t14 t9
punpckhwd m%5, m%4 ;packed t10 t13
paddsw m%5, m%2 ;low: t11 high: t12
mova m%2, [o(deint_shuf2)]
pshufb m%6, m%2
pshufb m%3, [o(deint_shuf1)]
pxor m%4, m%4
psubw m%4, m%5 ;packed -t10 -t13
psubw m%4, m%3 ;packed -t10 -t13
ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %4, %3, %7, 3784, 1567 ;low: t10a high: t13a
psubsw m%3, m%1, m%2 ;low: t11a high: t12a
paddsw m%1, m%2 ;low: t8a high: t15a
psubsw m%3, m%1, m%5 ;low: t11a high: t12a
paddsw m%1, m%5 ;low: t8a high: t15a
psubsw m%5, m%6, m%4 ;low: t10 high: t13
paddsw m%6, m%4 ;low: t9 high: t14
mova m%7, [o(pw_2896x8)]
punpckhqdq m%4, m%3, m%5 ;low: t12a high: t13
punpcklqdq m%3, m%5 ;low: t11a high: t10
psubw m%2, m%4, m%3
paddw m%3, m%4
pmulhrsw m%2, m%7 ;low: t11 high: t10a
pmulhrsw m%3, m%7 ;low: t12 high: t13a
pshufb m%3, m%2
pshufb m%5, m%2
ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
packssdw m%2, m%4 ;low: t11 high: t10a
packssdw m%3, m%5 ;low: t12 high: t13a
punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
punpcklqdq m%1, m%6 ;low: t8a high: t9
%endmacro
@ -2918,19 +2906,14 @@ ALIGN function_align
mova m0, [rsp+gprsize*2+16*1]
mova m2, [rsp+gprsize*2+16*2]
mova [rsp+gprsize*2+16*1], m4
psubsw m4, m0, m3 ;t13
psubsw m5, m0, m3 ;t13
paddsw m0, m3 ;t14
psubsw m3, m2, m1 ;t12a
mova m3, [o(pd_2048)]
psubsw m4, m2, m1 ;t12a
paddsw m1, m2 ;t15a
mova m5, [o(pw_2896x8)]
psubw m2, m4, m7 ;t13-t10
paddw m7, m4 ;t13+t10
psubw m4, m3, m6 ;t12a-t11a
paddw m6, m3 ;t12a+t11a
pmulhrsw m7, m5 ;t13a
pmulhrsw m4, m5 ;t11
pmulhrsw m6, m5 ;t12
pmulhrsw m5, m2 ;t10a
mova [rsp+gprsize*2+16*2], m1
ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
mova m3, [rsp+gprsize*2+16*8]
psubsw m2, m3, m5 ;out10
paddsw m3, m5 ;out5
@ -2950,6 +2933,7 @@ ALIGN function_align
mova [rsp+gprsize*2+16*5], m6
psubsw m6, m7, m0 ;out14
paddsw m7, m0 ;out1
mova m1, [rsp+gprsize*2+16*2]
mova m0, [rsp+gprsize*2+16*3]
mova [rsp+gprsize*2+16*4], m7
psubsw m7, m0, m1 ;out15
@ -4211,35 +4195,30 @@ ALIGN function_align
psubsw m5, m3, m2 ;t28a
paddsw m3, m2 ;t31a
ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28
mova m2, [rsp+gprsize*2+16*15] ;tmp12
psubsw m1, m5, m6 ;t20a
paddsw m5, m6 ;t19a
psubsw m6, m2, m5 ;out19
paddsw m2, m5 ;out12
mova m5, [rsp+gprsize*2+16*30] ;t27
mova [rsp+gprsize*2+16*22], m6 ;out19
mova [rsp+gprsize*2+16*15], m2 ;out12
mova m5, [rsp+gprsize*2+16*30] ;t27
psubsw m6, m4, m5 ;t27a
paddsw m4, m5 ;t28a
ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27
mova m2, [rsp+gprsize*2+16*6 ] ;tmp3
mova m7, [o(pw_2896x8)]
psubw m5, m6, m1 ;t27a - t20a
paddw m6, m1 ;t27a + t20a
psubsw m1, m2, m4 ;out28
psubsw m5, m2, m4 ;out28
paddsw m2, m4 ;out3
pmulhrsw m5, m7 ;t20
pmulhrsw m6, m7 ;t27
mova m4, [rsp+gprsize*2+16*14] ;tmp11
mova [rsp+gprsize*2+16*31], m1 ;out28
mova [rsp+gprsize*2+16*31], m5 ;out28
mova [rsp+gprsize*2+16*6 ], m2 ;out3
psubsw m1, m4, m5 ;out20
paddsw m4, m5 ;out11
psubsw m5, m4, m6 ;out20
paddsw m4, m6 ;out11
mova m2, [rsp+gprsize*2+16*7 ] ;tmp4
mova [rsp+gprsize*2+16*23], m1 ;out20
mova [rsp+gprsize*2+16*23], m5 ;out20
mova [rsp+gprsize*2+16*14], m4 ;out11
psubsw m5, m2, m6 ;out27
paddsw m2, m6 ;out4
psubsw m5, m2, m1 ;out27
paddsw m2, m1 ;out4
mova m1, [rsp+gprsize*2+16*26] ;t23a
mova m4, [rsp+gprsize*2+16*27] ;t24a
mova [rsp+gprsize*2+16*30], m5 ;out27
@ -4248,27 +4227,24 @@ ALIGN function_align
paddsw m0, m1 ;t16
psubsw m2, m3, m4 ;t24
paddsw m3, m4 ;t31
ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a
mova m6, [rsp+gprsize*2+16*18] ;tmp15
psubw m1, m2, m5 ;t24 - t23
paddw m2, m5 ;t24 + t23
psubsw m4, m6, m0 ;out16
paddsw m6, m0 ;out15
pmulhrsw m1, m7 ;t23a
pmulhrsw m2, m7 ;t24a
mova m0, [rsp+gprsize*2+16*3 ] ;tmp0
mova m5, [rsp+gprsize*2+16*11] ;tmp8
mova m1, [rsp+gprsize*2+16*11] ;tmp8
mova [rsp+gprsize*2+16*18], m6 ;out15
mova [rsp+gprsize*2+16*19], m4 ;out16
psubsw m6, m0, m3 ;out31
paddsw m0, m3 ;out0
psubsw m4, m5, m1 ;out23
paddsw m5, m1 ;out8
psubsw m4, m1, m2 ;out23
paddsw m1, m2 ;out8
mova m3, [rsp+gprsize*2+16*10] ;tmp7
mova [rsp+gprsize*2+16*34], m6 ;out31
mova [rsp+gprsize*2+16*11], m5 ;out8
mova [rsp+gprsize*2+16*11], m1 ;out8
mova [rsp+gprsize*2+16*26], m4 ;out23
paddsw m6, m3, m2 ;out7
psubsw m3, m2 ;out24
paddsw m6, m3, m5 ;out7
psubsw m3, m5 ;out24
mova m1, [rsp+gprsize*2+16*20] ;t17
mova m5, [rsp+gprsize*2+16*25] ;t22
mova m2, [rsp+gprsize*2+16*17] ;tmp14
@ -4283,23 +4259,20 @@ ALIGN function_align
mova [rsp+gprsize*2+16*20], m3 ;out17
psubsw m2, m1, m5 ;t25a
paddsw m1, m5 ;t30a
psubw m3, m2, m4 ;t25a - t22a
paddw m2, m4 ;t25a + t22a
ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25
mova m5, [rsp+gprsize*2+16*4 ] ;tmp1
pmulhrsw m3, m7 ;t22
pmulhrsw m2, m7 ;t25
psubsw m4, m5, m1 ;out30
psubsw m3, m5, m1 ;out30
paddsw m5, m1 ;out1
mova m1, [rsp+gprsize*2+16*12] ;tmp9
mova [rsp+gprsize*2+16*33], m4 ;out30
mova [rsp+gprsize*2+16*33], m3 ;out30
mova [rsp+gprsize*2+16*4 ], m5 ;out1
psubsw m4, m1, m3 ;out22
paddsw m1, m3 ;out9
psubsw m3, m1, m2 ;out22
paddsw m1, m2 ;out9
mova m5, [rsp+gprsize*2+16*9 ] ;tmp6
mova [rsp+gprsize*2+16*25], m4 ;out22
mova [rsp+gprsize*2+16*25], m3 ;out22
mova [rsp+gprsize*2+16*12], m1 ;out9
psubsw m3, m5, m2 ;out25
paddsw m5, m2 ;out6
psubsw m3, m5, m4 ;out25
paddsw m5, m4 ;out6
mova m4, [rsp+gprsize*2+16*21] ;t18a
mova m1, [rsp+gprsize*2+16*24] ;t21a
mova m2, [rsp+gprsize*2+16*16] ;tmp13
@ -4315,17 +4288,14 @@ ALIGN function_align
mova [rsp+gprsize*2+16*16], m2 ;out13
psubsw m5, m3, m1 ;t26
paddsw m3, m1 ;t29
ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a
mova m2, [rsp+gprsize*2+16*5 ] ;tmp2
psubw m1, m5, m4 ;t26 - t21
paddw m4, m5 ;t26 + t21
psubsw m5, m2, m3 ;out29
psubsw m1, m2, m3 ;out29
paddsw m2, m3 ;out2
pmulhrsw m1, m7 ;t21a
pmulhrsw m4, m7 ;t26a
mova m3, [rsp+gprsize*2+16*13] ;tmp10
mova [rsp+gprsize*2+16*32], m5 ;out29
psubsw m7, m3, m1 ;out21
paddsw m3, m1 ;out10
mova [rsp+gprsize*2+16*32], m1 ;out29
psubsw m7, m3, m5 ;out21
paddsw m3, m5 ;out10
mova m5, [rsp+gprsize*2+16*8 ] ;tmp5
mova [rsp+gprsize*2+16*24], m7 ;out21
mova [rsp+gprsize*2+16*13], m3 ;out10
@ -6010,262 +5980,237 @@ ALIGN function_align
psubw m5, m6, m3
ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t43, t52
mova m7, [o(pw_2896x8)]
mova m2, [rsp+gprsize*2+16*38] ;t35a
mova m3, [rsp+gprsize*2+16*31] ;tmp[28]
psubsw m6, m2, m0 ;t44
paddsw m2, m0 ;t35
psubsw m0, m3, m2 ;out35
paddsw m2, m3 ;out28
mova m3, [rsp+gprsize*2+16*63] ;t60a
mova [rsp+gprsize*2+16*38], m0 ;out35
mova [rsp+gprsize*2+16*31], m2 ;out28
mova m3, [rsp+gprsize*2+16*63] ;t60a
mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
psubsw m0, m3, m1 ;t51
paddsw m3, m1 ;t60
psubw m1, m0, m6 ;t44a
paddw m0, m6 ;t51a
psubsw m6, m2, m3 ;out60
ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a
mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
psubsw m1, m2, m3 ;out60
paddsw m2, m3 ;out3
pmulhrsw m1, m7 ;t44a
pmulhrsw m0, m7 ;t51a
mova m3, [rsp+gprsize*2+16*22] ;tmp[19]
mova [rsp+gprsize*2+16*63], m6 ;out60
mova [rsp+gprsize*2+16*63], m1 ;out60
mova [rsp+gprsize*2+16*6 ], m2 ;out3
psubsw m6, m3, m1 ;out44
paddsw m3, m1 ;out19
psubsw m1, m3, m0 ;out44
paddsw m3, m0 ;out19
mova m2, [rsp+gprsize*2+16*15] ;tmp[12]
mova [rsp+gprsize*2+16*47], m6 ;out44
mova [rsp+gprsize*2+16*22], m3 ;out19
psubsw m1, m2, m0 ;out51
paddsw m2, m0 ;out12
mova [rsp+gprsize*2+16*54], m1 ;out51
mova [rsp+gprsize*2+16*15], m2 ;out12
mova m0, [rsp+gprsize*2+16*39] ;t36
mova [rsp+gprsize*2+16*47], m1 ;out44
mova [rsp+gprsize*2+16*22], m3 ;out19
mova m1, [rsp+gprsize*2+16*62] ;t59
psubsw m3, m2, m6 ;out51
paddsw m2, m6 ;out12
mova [rsp+gprsize*2+16*54], m3 ;out51
mova [rsp+gprsize*2+16*15], m2 ;out12
psubsw m2, m0, m5 ;t43a
paddsw m0, m5 ;t36a
mova m5, [rsp+gprsize*2+16*30] ;tmp[27]
psubsw m3, m1, m4 ;t52a
paddsw m1, m4 ;t59a
psubw m5, m3, m2 ;t43
paddw m3, m2 ;t52
mova m2, [rsp+gprsize*2+16*30] ;tmp[27]
ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52
mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ]
pmulhrsw m5, m7 ;t43
pmulhrsw m3, m7 ;t52
psubsw m6, m2, m0 ;out36
paddsw m2, m0 ;out27
psubsw m6, m5, m0 ;out36
paddsw m5, m0 ;out27
psubsw m0, m4, m1 ;out59
paddsw m4, m1 ;out4
mova [rsp+gprsize*2+16*39], m6 ;out36
mova [rsp+gprsize*2+16*30], m2 ;out27
mova [rsp+gprsize*2+16*30], m5 ;out27
mova [rsp+gprsize*2+16*62], m0 ;out59
mova [rsp+gprsize*2+16*7 ], m4 ;out4
mova m0, [rsp+gprsize*2+16*23] ;tmp[20]
mova m2, [rsp+gprsize*2+16*14] ;tmp[11]
psubsw m4, m0, m5 ;out43
paddsw m0, m5 ;out20
psubsw m6, m2, m3 ;out52
paddsw m2, m3 ;out11
mova m5, [rsp+gprsize*2+16*14] ;tmp[11]
psubsw m4, m0, m3 ;out43
paddsw m0, m3 ;out20
psubsw m6, m5, m2 ;out52
paddsw m5, m2 ;out11
mova [rsp+gprsize*2+16*46], m4 ;out43
mova [rsp+gprsize*2+16*23], m0 ;out20
mova [rsp+gprsize*2+16*55], m6 ;out52
mova [rsp+gprsize*2+16*14], m2 ;out11
mova [rsp+gprsize*2+16*14], m5 ;out11
mova m0, [rsp+gprsize*2+16*40] ;t37a
mova m2, [rsp+gprsize*2+16*45] ;t42a
mova m5, [rsp+gprsize*2+16*45] ;t42a
mova m3, [rsp+gprsize*2+16*56] ;t53a
mova m1, [rsp+gprsize*2+16*61] ;t58a
psubsw m4, m0, m2 ;t42
paddsw m0, m2 ;t37
mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
psubsw m4, m0, m5 ;t42
paddsw m0, m5 ;t37
psubsw m5, m1, m3 ;t53
paddsw m1, m3 ;t58
psubw m6, m5, m4 ;t42a
paddw m5, m4 ;t53a
mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t42a, t53a
mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ]
pmulhrsw m6, m7 ;t42a
pmulhrsw m5, m7 ;t53a
psubsw m4, m2, m0 ;out37
psubsw m6, m2, m0 ;out37
paddsw m2, m0 ;out26
psubsw m0, m3, m1 ;out58
paddsw m3, m1 ;out5
mova [rsp+gprsize*2+16*40], m4 ;out37
mova [rsp+gprsize*2+16*40], m6 ;out37
mova [rsp+gprsize*2+16*29], m2 ;out26
mova [rsp+gprsize*2+16*61], m0 ;out58
mova [rsp+gprsize*2+16*8 ], m3 ;out5
mova m0, [rsp+gprsize*2+16*24] ;tmp[21]
mova m1, [rsp+gprsize*2+16*13] ;tmp[10]
psubsw m2, m0, m6 ;out42
paddsw m0, m6 ;out21
psubsw m3, m1, m5 ;out53
paddsw m1, m5 ;out10
psubsw m2, m0, m5 ;out42
paddsw m0, m5 ;out21
psubsw m3, m1, m4 ;out53
paddsw m1, m4 ;out10
mova [rsp+gprsize*2+16*45], m2 ;out42
mova [rsp+gprsize*2+16*24], m0 ;out21
mova [rsp+gprsize*2+16*56], m3 ;out53
mova [rsp+gprsize*2+16*13], m1 ;out10
mova m0, [rsp+gprsize*2+16*41] ;t38
mova m2, [rsp+gprsize*2+16*44] ;t41
mova m5, [rsp+gprsize*2+16*44] ;t41
mova m3, [rsp+gprsize*2+16*57] ;t54
mova m1, [rsp+gprsize*2+16*60] ;t57
psubsw m4, m0, m2 ;t41a
paddsw m0, m2 ;t38a
mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
psubsw m4, m0, m5 ;t41a
paddsw m0, m5 ;t38a
psubsw m5, m1, m3 ;t54a
paddsw m1, m3 ;t57a
psubw m6, m5, m4 ;t41
paddw m5, m4 ;t54
mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a
mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ]
pmulhrsw m6, m7 ;t41a
pmulhrsw m5, m7 ;t54a
psubsw m4, m2, m0 ;out38
psubsw m6, m2, m0 ;out38
paddsw m2, m0 ;out25
psubsw m0, m3, m1 ;out57
paddsw m3, m1 ;out6
mova [rsp+gprsize*2+16*41], m4 ;out38
mova [rsp+gprsize*2+16*41], m6 ;out38
mova [rsp+gprsize*2+16*28], m2 ;out25
mova [rsp+gprsize*2+16*60], m0 ;out57
mova [rsp+gprsize*2+16*9 ], m3 ;out6
mova m0, [rsp+gprsize*2+16*25] ;tmp[22]
mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ]
psubsw m2, m0, m6 ;out41
paddsw m0, m6 ;out22
psubsw m3, m1, m5 ;out54
paddsw m1, m5 ;out9
psubsw m2, m0, m5 ;out41
paddsw m0, m5 ;out22
psubsw m3, m1, m4 ;out54
paddsw m1, m4 ;out9
mova [rsp+gprsize*2+16*44], m2 ;out41
mova [rsp+gprsize*2+16*25], m0 ;out22
mova [rsp+gprsize*2+16*57], m3 ;out54
mova [rsp+gprsize*2+16*12], m1 ;out9
mova m0, [rsp+gprsize*2+16*42] ;t39a
mova m2, [rsp+gprsize*2+16*43] ;t40a
mova m5, [rsp+gprsize*2+16*43] ;t40a
mova m3, [rsp+gprsize*2+16*58] ;t55a
mova m1, [rsp+gprsize*2+16*59] ;t56a
psubsw m4, m0, m2 ;t40
paddsw m0, m2 ;t39
mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
psubsw m4, m0, m5 ;t40
paddsw m0, m5 ;t39
psubsw m5, m1, m3 ;t55
paddsw m1, m3 ;t56
psubw m6, m5, m4 ;t40a
paddw m5, m4 ;t55a
mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a
mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ]
pmulhrsw m6, m7 ;t40a
pmulhrsw m5, m7 ;t55a
psubsw m4, m2, m0 ;out39
psubsw m6, m2, m0 ;out39
paddsw m2, m0 ;out24
psubsw m0, m3, m1 ;out56
paddsw m3, m1 ;out7
mova [rsp+gprsize*2+16*42], m4 ;out39
mova [rsp+gprsize*2+16*42], m6 ;out39
mova [rsp+gprsize*2+16*27], m2 ;out24
mova [rsp+gprsize*2+16*59], m0 ;out56
mova [rsp+gprsize*2+16*10], m3 ;out7
mova m0, [rsp+gprsize*2+16*26] ;tmp[23]
mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ]
psubsw m2, m0, m6 ;out40
paddsw m0, m6 ;out23
psubsw m3, m1, m5 ;out55
paddsw m1, m5 ;out8
psubsw m2, m0, m5 ;out40
paddsw m0, m5 ;out23
psubsw m3, m1, m4 ;out55
paddsw m1, m4 ;out8
mova [rsp+gprsize*2+16*43], m2 ;out40
mova [rsp+gprsize*2+16*26], m0 ;out23
mova [rsp+gprsize*2+16*58], m3 ;out55
mova [rsp+gprsize*2+16*11], m1 ;out8
mova m0, [rsp+gprsize*2+16*37] ;t34
mova m2, [rsp+gprsize*2+16*48] ;t45
mova m5, [rsp+gprsize*2+16*48] ;t45
mova m3, [rsp+gprsize*2+16*53] ;t50
mova m1, [rsp+gprsize*2+16*64] ;t61
psubsw m4, m0, m2 ;t45a
paddsw m0, m2 ;t34a
mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
psubsw m4, m0, m5 ;t45a
paddsw m0, m5 ;t34a
psubsw m5, m1, m3 ;t50a
paddsw m1, m3 ;t61a
psubw m6, m5, m4 ;t45
paddw m5, m4 ;t50
mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ]
pmulhrsw m6, m7 ;t45
pmulhrsw m5, m7 ;t50
psubsw m4, m2, m0 ;out34
psubsw m6, m2, m0 ;out34
paddsw m2, m0 ;out29
psubsw m0, m3, m1 ;out61
paddsw m3, m1 ;out2
mova [rsp+gprsize*2+16*37], m4 ;out34
mova [rsp+gprsize*2+16*37], m6 ;out34
mova [rsp+gprsize*2+16*32], m2 ;out29
mova [rsp+gprsize*2+16*64], m0 ;out61
mova [rsp+gprsize*2+16*5 ], m3 ;out2
mova m0, [rsp+gprsize*2+16*21] ;tmp[18]
mova m1, [rsp+gprsize*2+16*16] ;tmp[13]
psubsw m2, m0, m6 ;out45
paddsw m0, m6 ;out18
psubsw m3, m1, m5 ;out50
paddsw m1, m5 ;out13
psubsw m2, m0, m5 ;out45
paddsw m0, m5 ;out18
psubsw m3, m1, m4 ;out50
paddsw m1, m4 ;out13
mova [rsp+gprsize*2+16*48], m2 ;out45
mova [rsp+gprsize*2+16*21], m0 ;out18
mova [rsp+gprsize*2+16*53], m3 ;out50
mova [rsp+gprsize*2+16*16], m1 ;out13
mova m0, [rsp+gprsize*2+16*36] ;t33a
mova m2, [rsp+gprsize*2+16*49] ;t46a
mova m5, [rsp+gprsize*2+16*49] ;t46a
mova m3, [rsp+gprsize*2+16*52] ;t49a
mova m1, [rsp+gprsize*2+16*65] ;t62a
psubsw m4, m0, m2 ;t46
paddsw m0, m2 ;t33
mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
psubsw m4, m0, m5 ;t46
paddsw m0, m5 ;t33
psubsw m5, m1, m3 ;t49
paddsw m1, m3 ;t62
psubw m6, m5, m4 ;t46a
paddw m5, m4 ;t49a
mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t46a, t49a
mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ]
pmulhrsw m6, m7 ;t46a
pmulhrsw m5, m7 ;t49a
psubsw m4, m2, m0 ;out33
psubsw m6, m2, m0 ;out33
paddsw m2, m0 ;out30
psubsw m0, m3, m1 ;out62
paddsw m3, m1 ;out1
mova [rsp+gprsize*2+16*36], m4 ;out33
mova [rsp+gprsize*2+16*36], m6 ;out33
mova [rsp+gprsize*2+16*33], m2 ;out30
mova [rsp+gprsize*2+16*65], m0 ;out62
mova [rsp+gprsize*2+16*4 ], m3 ;out1
mova m0, [rsp+gprsize*2+16*20] ;tmp[17]
mova m1, [rsp+gprsize*2+16*17] ;tmp[14]
psubsw m2, m0, m6 ;out46
paddsw m0, m6 ;out17
psubsw m3, m1, m5 ;out49
paddsw m1, m5 ;out14
psubsw m2, m0, m5 ;out46
paddsw m0, m5 ;out17
psubsw m3, m1, m4 ;out49
paddsw m1, m4 ;out14
mova [rsp+gprsize*2+16*49], m2 ;out46
mova [rsp+gprsize*2+16*20], m0 ;out17
mova [rsp+gprsize*2+16*52], m3 ;out49
mova [rsp+gprsize*2+16*17], m1 ;out14
mova m0, [rsp+gprsize*2+16*35] ;t32
mova m2, [rsp+gprsize*2+16*50] ;t47
mova m5, [rsp+gprsize*2+16*50] ;t47
mova m3, [rsp+gprsize*2+16*51] ;t48
mova m1, [rsp+gprsize*2+16*66] ;t63
psubsw m4, m0, m2 ;t47a
paddsw m0, m2 ;t32a
mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
psubsw m4, m0, m5 ;t47a
paddsw m0, m5 ;t32a
psubsw m5, m1, m3 ;t48a
paddsw m1, m3 ;t63a
psubw m6, m5, m4 ;t47
paddw m5, m4 ;t48
mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ]
pmulhrsw m6, m7 ;t47
pmulhrsw m5, m7 ;t48
psubsw m4, m2, m0 ;out32
psubsw m6, m2, m0 ;out32
paddsw m2, m0 ;out31
psubsw m0, m3, m1 ;out63
paddsw m3, m1 ;out0
mova [rsp+gprsize*2+16*35], m4 ;out32
mova [rsp+gprsize*2+16*35], m6 ;out32
mova [rsp+gprsize*2+16*34], m2 ;out31
mova [rsp+gprsize*2+16*66], m0 ;out63
mova [rsp+gprsize*2+16*3 ], m3 ;out0
mova m0, [rsp+gprsize*2+16*19] ;tmp[16]
mova m1, [rsp+gprsize*2+16*18] ;tmp[15]
psubsw m2, m0, m6 ;out47
paddsw m0, m6 ;out16
psubsw m3, m1, m5 ;out48
paddsw m1, m5 ;out15
psubsw m2, m0, m5 ;out47
paddsw m0, m5 ;out16
psubsw m3, m1, m4 ;out48
paddsw m1, m4 ;out15
mova [rsp+gprsize*2+16*50], m2 ;out47
mova [rsp+gprsize*2+16*19], m0 ;out16
mova [rsp+gprsize*2+16*51], m3 ;out48
@ -6273,7 +6218,6 @@ ALIGN function_align
ret
cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
LEA r5, $$


@ -88,7 +88,11 @@ decl_blend_dir_fn(dav1d_blend_h_avx2);
decl_blend_dir_fn(dav1d_blend_h_ssse3);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
decl_emu_edge_fn(dav1d_emu_edge_avx2);
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
@ -134,9 +138,21 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->blend = dav1d_blend_ssse3;
c->blend_v = dav1d_blend_v_ssse3;
c->blend_h = dav1d_blend_h_ssse3;
c->warp8x8 = dav1d_warp_affine_8x8_ssse3;
c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
c->emu_edge = dav1d_emu_edge_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41))
return;
#if BITDEPTH == 8
c->warp8x8 = dav1d_warp_affine_8x8_sse4;
c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
#endif
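/* Note: when SSE4.1 is available, the assignments above override the
 * warp8x8/warp8x8t pointers that the SSSE3 block set earlier, so the
 * fastest supported implementation is the one left in the context. */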
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
return;


@ -44,6 +44,10 @@ obmc_masks: db 0, 0, 0, 0
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
@ -53,17 +57,18 @@ subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
pb_64: times 16 db 64
pw_8: times 8 dw 8
pw_26: times 8 dw 26
pw_34: times 8 dw 34
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
pd_512: times 4 dd 512
pb_64: times 16 db 64
pw_8: times 8 dw 8
pw_26: times 8 dw 26
pw_34: times 8 dw 34
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
pd_512: times 4 dd 512
pd_32768: times 4 dd 32768
pw_258: times 2 dw 258
@ -146,6 +151,8 @@ HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
cextern mc_warp_filter
SECTION .text
INIT_XMM ssse3
@ -3302,6 +3309,580 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
jg .hv_w8_loop0
RET
%if ARCH_X86_32
%macro SAVE_ALPHA_BETA 0
mov alpham, alphad
mov betam, betad
%endmacro
%macro SAVE_DELTA_GAMMA 0
mov deltam, deltad
mov gammam, gammad
%endmacro
%macro LOAD_ALPHA_BETA_MX 0
mov mym, myd
mov alphad, alpham
mov betad, betam
mov mxd, mxm
%endmacro
%macro LOAD_DELTA_GAMMA_MY 0
mov mxm, mxd
mov deltad, deltam
mov gammad, gammam
mov myd, mym
%endmacro
%define PIC_reg r2
%define PIC_base_offset $$
%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
%else
%define SAVE_ALPHA_BETA
%define SAVE_DELTA_GAMMA
%define PIC_sym(sym) sym
%endif
%if ARCH_X86_32
%if STACK_ALIGNMENT < required_stack_alignment
%assign copy_args 8*4
%else
%assign copy_args 0
%endif
%endif
%macro RELOC_ARGS 0
%if copy_args
mov r0, r0m
mov r1, r1m
mov r2, r2m
mov r3, r3m
mov r5, r5m
mov dstm, r0
mov dsm, r1
mov srcm, r2
mov ssm, r3
mov mxm, r5
mov r0, r6m
mov mym, r0
%endif
%endmacro
%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
%if cpuflag(sse4)
pblendw %1, %2, 0xAA
%else
pand %2, m10
por %1, %2
%endif
%endmacro
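; Illustrative note: the 0xAA immediate makes pblendw copy words 1, 3, 5 and
; 7 (the high word of each dword) from %2 into %1, while the SSSE3 fallback
; reaches the same result by masking %2 with m10 (every dword set to
; 0xffff0000 via pcmpeqd/pslld in .main) and or-ing it into %1, whose high
; words are already zero after the preceding psrld by 16.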
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
; Can be done using gathers, but that's terribly slow on many CPUs
%if ARCH_X86_32
%define m8 m4
%define m9 m5
%define m14 m6
%define m15 m7
%define m11 m7
pxor m11, m11
%endif
lea tmp1d, [myq+deltaq*4]
lea tmp2d, [myq+deltaq*1]
shr myd, 10
shr tmp1d, 10
movq m2, [filterq+myq *8] ; a
movq m8, [filterq+tmp1q*8] ; e
lea tmp1d, [tmp2q+deltaq*4]
lea myd, [tmp2q+deltaq*1]
shr tmp2d, 10
shr tmp1d, 10
movq m3, [filterq+tmp2q*8] ; b
movq m0, [filterq+tmp1q*8] ; f
punpcklwd m2, m3
punpcklwd m8, m0
lea tmp1d, [myq+deltaq*4]
lea tmp2d, [myq+deltaq*1]
shr myd, 10
shr tmp1d, 10
movq m0, [filterq+myq *8] ; c
movq m9, [filterq+tmp1q*8] ; g
lea tmp1d, [tmp2q+deltaq*4]
lea myd, [tmp2q+gammaq] ; my += gamma
shr tmp2d, 10
shr tmp1d, 10
movq m3, [filterq+tmp2q*8] ; d
movq m1, [filterq+tmp1q*8] ; h
punpcklwd m0, m3
punpcklwd m9, m1
punpckldq m1, m2, m0
punpckhdq m2, m0
punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
pmaddwd m0, %3
pmaddwd m3, %5
pmaddwd m1, %7
pmaddwd m14, %9
paddd m0, m3
paddd m1, m14
paddd m0, m1
mova %1, m0
%if ARCH_X86_64
SWAP m3, m14
%endif
punpckldq m0, m8, m9
punpckhdq m8, m9
punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
pmaddwd m1, %4
pmaddwd m14, %6
pmaddwd m2, %8
pmaddwd m15, %10
paddd m1, m14
paddd m2, m15
paddd m1, m2
mova %2, m1
%if ARCH_X86_64
SWAP m14, m3
%endif
%endmacro
%if ARCH_X86_64
%define counterd r4d
%else
%if copy_args == 0
%define counterd dword r4m
%else
%define counterd dword [esp+stack_size-4*7]
%endif
%endif
%macro WARP_AFFINE_8X8T 0
%if ARCH_X86_64
cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
%else
cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
%if copy_args
%define tmpm [esp+stack_size-4*1]
%define tsm [esp+stack_size-4*2]
%endif
%endif
call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
.loop:
%if ARCH_X86_32
%define m12 m4
%define m13 m5
%define m14 m6
%define m15 m7
mova m12, [esp+0xC0]
mova m13, [esp+0xD0]
mova m14, [esp+0xE0]
mova m15, [esp+0xF0]
%endif
psrad m12, 13
psrad m13, 13
psrad m14, 13
psrad m15, 13
packssdw m12, m13
packssdw m14, m15
mova m13, [PIC_sym(pw_8192)]
pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
pmulhrsw m14, m13
mova [tmpq+tsq*0], m12
mova [tmpq+tsq*2], m14
dec counterd
jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
%if ARCH_X86_32
mov tmpm, tmpd
mov r0, [esp+0x100]
mov r1, [esp+0x104]
%endif
call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
lea tmpq, [tmpq+tsq*4]
jmp .loop
%endmacro
%macro WARP_AFFINE_8X8 0
%if ARCH_X86_64
cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
filter, tmp1, delta, my, gamma
%else
cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
filter, tmp1, delta, my, gamma
%define alphaq r0
%define alphad r0
%define alpham [esp+gprsize+0x100]
%define betaq r1
%define betad r1
%define betam [esp+gprsize+0x104]
%define deltaq r0
%define deltad r0
%define deltam [esp+gprsize+0x108]
%define gammaq r1
%define gammad r1
%define gammam [esp+gprsize+0x10C]
%define filterq r3
%define tmp1q r4
%define tmp1d r4
%define tmp1m [esp+gprsize+0x110]
%define myq r5
%define myd r5
%define mym r6m
%if copy_args
%define dstm [esp+stack_size-4*1]
%define dsm [esp+stack_size-4*2]
%define srcm [esp+stack_size-4*3]
%define ssm [esp+stack_size-4*4]
%define mxm [esp+stack_size-4*5]
%define mym [esp+stack_size-4*6]
%endif
%endif
call .main
jmp .start
.loop:
%if ARCH_X86_32
mov dstm, dstd
mov alphad, [esp+0x100]
mov betad, [esp+0x104]
%endif
call .main2
lea dstq, [dstq+dsq*2]
.start:
%if cpuflag(ssse3)
%if ARCH_X86_64
mova m10, [PIC_sym(pw_8192)]
%else
%define m10 [PIC_sym(pw_8192)]
%endif
%endif
%if ARCH_X86_32
%define m12 m5
%define m13 m6
mova m12, [esp+0xC0]
mova m13, [esp+0xD0]
%endif
%if cpuflag(sse4)
%if ARCH_X86_32
%define m11 m4
pxor m11, m11
%endif
psrad m12, 18
psrad m13, 18
packusdw m12, m13
pavgw m12, m11 ; (x + (1 << 10)) >> 11
%else
psrad m12, 17
psrad m13, 17
packssdw m12, m13
pmulhrsw m12, m10 ; (x + (1 << 10)) >> 11
%endif
%if ARCH_X86_32
%define m14 m6
%define m15 m7
mova m14, [esp+0xE0]
mova m15, [esp+0xF0]
%endif
%if cpuflag(sse4)
psrad m14, 18
psrad m15, 18
packusdw m14, m15
pavgw m14, m11 ; (x + (1 << 10)) >> 11
%else
psrad m14, 17
psrad m15, 17
packssdw m14, m15
pmulhrsw m14, m10 ; (x + (1 << 10)) >> 11
%endif
packuswb m12, m14
movq [dstq+dsq*0], m12
movhps [dstq+dsq*1], m12
dec counterd
jg .loop
.end:
RET
ALIGN function_align
.main:
%assign stack_offset stack_offset+gprsize
%if ARCH_X86_32
%assign stack_size stack_size+4
%if copy_args
%assign stack_offset stack_offset-4
%endif
RELOC_ARGS
LEA PIC_reg, $$
%define PIC_mem [esp+gprsize+0x114]
mov abcdd, abcdm
%if copy_args == 0
mov ssd, ssm
mov mxd, mxm
%endif
mov PIC_mem, PIC_reg
mov srcd, srcm
%endif
movsx deltad, word [abcdq+2*2]
movsx gammad, word [abcdq+2*3]
lea tmp1d, [deltaq*3]
sub gammad, tmp1d ; gamma -= delta*3
SAVE_DELTA_GAMMA
%if ARCH_X86_32
mov abcdd, abcdm
%endif
movsx alphad, word [abcdq+2*0]
movsx betad, word [abcdq+2*1]
lea tmp1q, [ssq*3+3]
add mxd, 512+(64<<10)
lea tmp2d, [alphaq*3]
sub srcq, tmp1q ; src -= src_stride*3 + 3
%if ARCH_X86_32
mov srcm, srcd
mov PIC_reg, PIC_mem
%endif
sub betad, tmp2d ; beta -= alpha*3
lea filterq, [PIC_sym(mc_warp_filter)]
%if ARCH_X86_64
mov myd, r6m
pxor m11, m11
%endif
call .h
psrld m2, m0, 16
psrld m3, m1, 16
%if ARCH_X86_32
mova [esp+gprsize+0x10], m3
%endif
call .h
psrld m4, m0, 16
psrld m5, m1, 16
%if ARCH_X86_32
mova [esp+gprsize+0x20], m4
mova [esp+gprsize+0x30], m5
%endif
call .h
%if ARCH_X86_64
%define blendmask [rsp+gprsize+0x80]
%else
mova m3, [esp+gprsize+0x10]
%define blendmask [esp+gprsize+0x120]
%define m10 m7
%endif
pcmpeqd m10, m10
pslld m10, 16
mova blendmask, m10
BLENDHWDW m2, m0 ; 0
BLENDHWDW m3, m1 ; 2
mova [rsp+gprsize+0x00], m2
mova [rsp+gprsize+0x10], m3
call .h
%if ARCH_X86_32
mova m4, [esp+gprsize+0x20]
mova m5, [esp+gprsize+0x30]
%endif
mova m10, blendmask
BLENDHWDW m4, m0 ; 1
BLENDHWDW m5, m1 ; 3
mova [rsp+gprsize+0x20], m4
mova [rsp+gprsize+0x30], m5
call .h
%if ARCH_X86_32
mova m3, [esp+gprsize+0x10]
%define m10 m5
%endif
psrld m6, m2, 16
psrld m7, m3, 16
mova m10, blendmask
BLENDHWDW m6, m0 ; 2
BLENDHWDW m7, m1 ; 4
mova [rsp+gprsize+0x40], m6
mova [rsp+gprsize+0x50], m7
call .h
%if ARCH_X86_32
mova m4, [esp+gprsize+0x20]
mova m5, [esp+gprsize+0x30]
%endif
psrld m2, m4, 16
psrld m3, m5, 16
mova m10, blendmask
BLENDHWDW m2, m0 ; 3
BLENDHWDW m3, m1 ; 5
mova [rsp+gprsize+0x60], m2
mova [rsp+gprsize+0x70], m3
call .h
%if ARCH_X86_32
mova m6, [esp+gprsize+0x40]
mova m7, [esp+gprsize+0x50]
%define m10 m7
%endif
psrld m4, m6, 16
psrld m5, m7, 16
mova m10, blendmask
BLENDHWDW m4, m0 ; 4
BLENDHWDW m5, m1 ; 6
%if ARCH_X86_64
add myd, 512+(64<<10)
mova m6, m2
mova m7, m3
%else
mova [esp+gprsize+0x80], m4
mova [esp+gprsize+0x90], m5
add dword mym, 512+(64<<10)
%endif
mov counterd, 4
SAVE_ALPHA_BETA
.main2:
call .h
%if ARCH_X86_32
mova m6, [esp+gprsize+0x60]
mova m7, [esp+gprsize+0x70]
%define m10 m5
%endif
psrld m6, 16
psrld m7, 16
mova m10, blendmask
BLENDHWDW m6, m0 ; 5
BLENDHWDW m7, m1 ; 7
%if ARCH_X86_64
WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
m4, m5, \
[rsp+gprsize+0x20], [rsp+gprsize+0x30], \
m6, m7
%else
mova [esp+gprsize+0xA0], m6
mova [esp+gprsize+0xB0], m7
LOAD_DELTA_GAMMA_MY
WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
[esp+gprsize+0x00], [esp+gprsize+0x10], \
[esp+gprsize+0x80], [esp+gprsize+0x90], \
[esp+gprsize+0x20], [esp+gprsize+0x30], \
[esp+gprsize+0xA0], [esp+gprsize+0xB0]
LOAD_ALPHA_BETA_MX
%endif
call .h
mova m2, [rsp+gprsize+0x40]
mova m3, [rsp+gprsize+0x50]
%if ARCH_X86_32
mova m4, [rsp+gprsize+0x80]
mova m5, [rsp+gprsize+0x90]
%define m10 m7
%endif
mova [rsp+gprsize+0x00], m2
mova [rsp+gprsize+0x10], m3
mova [rsp+gprsize+0x40], m4
mova [rsp+gprsize+0x50], m5
psrld m4, 16
psrld m5, 16
mova m10, blendmask
BLENDHWDW m4, m0 ; 6
BLENDHWDW m5, m1 ; 8
%if ARCH_X86_64
WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
m6, m7, \
[rsp+gprsize+0x00], [rsp+gprsize+0x10], \
m4, m5
%else
mova [esp+gprsize+0x80], m4
mova [esp+gprsize+0x90], m5
LOAD_DELTA_GAMMA_MY
WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
[esp+gprsize+0x20], [esp+gprsize+0x30], \
[esp+gprsize+0xA0], [esp+gprsize+0xB0], \
[esp+gprsize+0x00], [esp+gprsize+0x10], \
[esp+gprsize+0x80], [esp+gprsize+0x90]
mov mym, myd
mov dstd, dstm
mov dsd, dsm
mov mxd, mxm
%endif
mova m2, [rsp+gprsize+0x60]
mova m3, [rsp+gprsize+0x70]
%if ARCH_X86_32
mova m6, [esp+gprsize+0xA0]
mova m7, [esp+gprsize+0xB0]
%endif
mova [rsp+gprsize+0x20], m2
mova [rsp+gprsize+0x30], m3
mova [rsp+gprsize+0x60], m6
mova [rsp+gprsize+0x70], m7
ret
ALIGN function_align
.h:
%if ARCH_X86_32
%define m8 m3
%define m9 m4
%define m10 m5
%define m14 m6
%define m15 m7
%endif
lea tmp1d, [mxq+alphaq*4]
lea tmp2d, [mxq+alphaq*1]
%if ARCH_X86_32
%assign stack_offset stack_offset+4
%assign stack_size stack_size+4
%define PIC_mem [esp+gprsize*2+0x114]
mov PIC_mem, PIC_reg
mov srcd, srcm
%endif
movu m10, [srcq]
%if ARCH_X86_32
add srcd, ssm
mov srcm, srcd
mov PIC_reg, PIC_mem
%else
add srcq, ssq
%endif
shr mxd, 10
shr tmp1d, 10
movq m1, [filterq+mxq *8] ; 0 X
movq m8, [filterq+tmp1q*8] ; 4 X
lea tmp1d, [tmp2q+alphaq*4]
lea mxd, [tmp2q+alphaq*1]
shr tmp2d, 10
shr tmp1d, 10
movhps m1, [filterq+tmp2q*8] ; 0 1
movhps m8, [filterq+tmp1q*8] ; 4 5
lea tmp1d, [mxq+alphaq*4]
lea tmp2d, [mxq+alphaq*1]
shr mxd, 10
shr tmp1d, 10
movq m14, [filterq+mxq *8] ; 2 X
movq m9, [filterq+tmp1q*8] ; 6 X
lea tmp1d, [tmp2q+alphaq*4]
lea mxd, [tmp2q+betaq] ; mx += beta
shr tmp2d, 10
shr tmp1d, 10
movhps m14, [filterq+tmp2q*8] ; 2 3
movhps m9, [filterq+tmp1q*8] ; 6 7
pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
pmaddubsw m0, m1
pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
pmaddubsw m1, m8
pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
pmaddubsw m15, m14
pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
pmaddubsw m10, m9
mova m14, [PIC_sym(pw_8192)]
mova m9, [PIC_sym(pd_32768)]
phaddw m0, m15
phaddw m1, m10
pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
pmaddwd m1, m14
paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
paddd m1, m9
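; Illustrative note: each dword now holds (v << 13) + (1 << 15), where v is
; the 17-bit horizontal sum, so its upper 16 bits equal the rounded value
; (v + 4) >> 3, i.e. the 14-bit result that the vertical pass later picks
; out of the high word.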
ret
%endmacro
INIT_XMM sse4
WARP_AFFINE_8X8
WARP_AFFINE_8X8T
INIT_XMM ssse3
WARP_AFFINE_8X8
WARP_AFFINE_8X8T
%if WIN64
DECLARE_REG_TMP 6, 4
%else


@ -142,6 +142,7 @@ static struct {
unsigned int seed;
int bench_c;
int verbose;
int function_listing;
} state;
/* float compare support code */
@ -365,6 +366,14 @@ static void print_benchs(const CheckasmFunc *const f) {
}
#endif
static void print_functions(const CheckasmFunc *const f) {
if (f) {
print_functions(f->child[0]);
printf("%s\n", f->name);
print_functions(f->child[1]);
}
}
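/* Note: this in-order walk prints the recorded function names in the same
 * ASCIIbetical order in which get_func() inserted them into the tree. */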
#define is_digit(x) ((x) >= '0' && (x) <= '9')
/* ASCIIbetical sort except preserving natural order for numbers */
@ -515,7 +524,8 @@ int main(int argc, char *argv[]) {
"Options:\n"
" --test=<test_name> Test only <test_name>\n"
" --bench=<pattern> Test and benchmark the functions matching <pattern>\n"
" --list List the available tests\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --bench-c Benchmark the C-only functions\n"
" --verbose -v Print failures verbosely\n");
return 0;
@ -534,11 +544,11 @@ int main(int argc, char *argv[]) {
state.bench_pattern = "";
} else if (!strncmp(argv[1], "--test=", 7)) {
state.test_name = argv[1] + 7;
} else if (!strcmp(argv[1], "--list")) {
fprintf(stderr, "checkasm: available tests [");
for (int i = 0; tests[i].func; i++)
fprintf(stderr, "%s%s", i ? ", ": "", tests[i].name);
fprintf(stderr, "]\n");
} else if (!strcmp(argv[1], "--list-functions")) {
state.function_listing = 1;
} else if (!strcmp(argv[1], "--list-tests")) {
for (int i = 0; tests[i].name; i++)
printf("%s\n", tests[i].name);
return 0;
} else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
state.verbose = 1;
@ -553,24 +563,28 @@ int main(int argc, char *argv[]) {
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
check_cpu_flag(NULL, 0);
for (int i = 0; cpus[i].flag; i++)
check_cpu_flag(cpus[i].name, cpus[i].flag);
if (!state.num_checked) {
fprintf(stderr, "checkasm: no tests to perform\n");
} else if (state.num_failed) {
fprintf(stderr, "checkasm: %d of %d tests have failed\n",
state.num_failed, state.num_checked);
ret = 1;
if (state.function_listing) {
print_functions(state.funcs);
} else {
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
for (int i = 0; cpus[i].flag; i++)
check_cpu_flag(cpus[i].name, cpus[i].flag);
if (!state.num_checked) {
fprintf(stderr, "checkasm: no tests to perform\n");
} else if (state.num_failed) {
fprintf(stderr, "checkasm: %d of %d tests have failed\n",
state.num_failed, state.num_checked);
ret = 1;
} else {
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
#ifdef readtime
if (state.bench_pattern) {
state.nop_time = measure_nop_time();
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
print_benchs(state.funcs);
}
if (state.bench_pattern) {
state.nop_time = measure_nop_time();
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
print_benchs(state.funcs);
}
#endif
}
}
destroy_func_tree(state.funcs);
@ -592,6 +606,10 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
return NULL;
state.current_func = get_func(&state.funcs, name_buf);
if (state.function_listing) /* Save function names without running tests */
return NULL;
state.funcs->color = 1;
CheckasmFuncVersion *v = &state.current_func->versions;
void *ref = func;


@ -34,6 +34,12 @@
#define UNIT_TEST 1
#include "src/fg_apply_tmpl.c"
static const char ss_name[][4] = {
[DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
[DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
[DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
};
static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
@ -72,6 +78,64 @@ static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
report("gen_grain_y");
}
static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
declare_func(void, entry grain_lut[][GRAIN_WIDTH],
const entry grain_lut_y[][GRAIN_WIDTH],
const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX);
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
const enum Dav1dPixelLayout layout = layout_idx + 1;
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
for (int i = 0; i < 4; i++) {
if (check_func(dsp->generate_grain_uv[layout_idx],
"gen_grain_uv_ar%d_%dbpc_%s",
i, BITDEPTH, ss_name[layout_idx]))
{
Dav1dFilmGrainData fg_data;
fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
#endif
fg_data.num_y_points = rnd() & 1;
fg_data.grain_scale_shift = rnd() & 3;
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
fg_data.ar_coeff_lag = i;
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
for (int n = 0; n < num_y_pos; n++)
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
const int uv = rnd() & 1;
const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
for (int n = 0; n < num_uv_pos; n++)
fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
if (!fg_data.num_y_points)
fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
if (diff) fail();
bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
}
}
}
report("gen_grain_uv");
}
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
@ -157,11 +221,6 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
int is_identity HIGHBD_DECL_SUFFIX);
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
const char ss_name[][4] = {
[DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
[DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
[DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
};
const enum Dav1dPixelLayout layout = layout_idx + 1;
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
@ -264,6 +323,7 @@ void bitfn(checkasm_check_filmgrain)(void) {
bitfn(dav1d_film_grain_dsp_init)(&c);
check_gen_grny(&c);
check_gen_grnuv(&c);
check_fgy_sbrow(&c);
check_fguv_sbrow(&c);
}


@ -29,6 +29,8 @@
#include "src/ipred.h"
#include "src/levels.h"
#include <stdio.h>
static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
[DC_PRED] = "dc",
[DC_128_PRED] = "dc_128",
@ -83,11 +85,16 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
{
const ptrdiff_t stride = w * sizeof(pixel);
int a = 0;
if (mode >= Z1_PRED && mode <= Z3_PRED) /* angle */
int a = 0, maxw = 0, maxh = 0;
if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
(rnd() & 0x600);
else if (mode == FILTER_PRED) /* filter_idx */
if (mode == Z2_PRED) {
maxw = rnd(), maxh = rnd();
maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
}
} else if (mode == FILTER_PRED) /* filter_idx */
a = (rnd() % 5) | (rnd() & ~511);
#if BITDEPTH == 16
@ -99,13 +106,23 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
for (int i = -h * 2; i <= w * 2; i++)
topleft[i] = rnd() & bitdepth_max;
const int maxw = 1 + (rnd() % 128), maxh = 1 + (rnd() % 128);
call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
HIGHBD_TAIL_SUFFIX);
checkasm_check_pixel(c_dst, stride, a_dst, stride,
w, h, "dst");
if (checkasm_check_pixel(c_dst, stride, a_dst, stride,
w, h, "dst"))
{
if (mode == Z1_PRED || mode == Z3_PRED)
fprintf(stderr, "angle = %d (0x%03x)\n",
a & 0x1ff, a & 0x600);
else if (mode == Z2_PRED)
fprintf(stderr, "angle = %d (0x%03x), "
"max_width = %d, max_height = %d\n",
a & 0x1ff, a & 0x600, maxw, maxh);
else if (mode == FILTER_PRED)
fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
}
bench_new(a_dst, stride, topleft, w, h, a, 128, 128
HIGHBD_TAIL_SUFFIX);